Update app.py
app.py
CHANGED
@@ -7,34 +7,94 @@ from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Load environment variables
-load_dotenv()
-
-# Load and process the PDF files
-loader = PyPDFLoader("./new_papers/ALiBi.pdf")
-documents = loader.load()
-
-# Split the documents into chunks and embed them using HuggingFaceBgeEmbeddings
-text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
-vdocuments = text_splitter.split_documents(documents)
-
-api_db = FAISS.from_texts(texts=docs_text, embedding=embeddings)
+#load_dotenv()
+
+
+def get_pdf_text(pdf_docs):
+    """
+    Extract text from a list of PDF documents.
+
+    Parameters
+    ----------
+    pdf_docs : list
+        List of PDF documents to extract text from.
+
+    Returns
+    -------
+    str
+        Extracted text from all the PDF documents.
+
+    """
+    text = ""
+    for pdf in pdf_docs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+
+
+def get_text_chunks(text):
+    """
+    Split the input text into chunks.
+
+    Parameters
+    ----------
+    text : str
+        The input text to be split.
+
+    Returns
+    -------
+    list
+        List of text chunks.
+
+    """
+    text_splitter = CharacterTextSplitter(
+        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
+    )
+    chunks = text_splitter.split_text(text)
+    return chunks
+
+
+def get_vectorstore(text_chunks):
+    """
+    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.
+
+    Parameters
+    ----------
+    text_chunks : list
+        List of text chunks to be embedded.
+
+    Returns
+    -------
+    FAISS
+        A FAISS vector store containing the embeddings of the text chunks.
+
+    """
+    model = "BAAI/bge-base-en-v1.5"
+    encode_kwargs = {
+        "normalize_embeddings": True
+    }  # set True to compute cosine similarity
+    embeddings = HuggingFaceBgeEmbeddings(
+        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
+    )
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    print("-----")
+    # as_retriever is a method and has no .similarity attribute; query the store directly
+    print(vectorstore.similarity_search("What is ALiBi?"))
+    print("-----")
+    return vectorstore
+
+# get_pdf_text expects a list of PDFs, so the single path is wrapped in a list
+pdf_text = get_pdf_text(["./new_papers/ALiBi.pdf"])
+text_chunks = get_text_chunks(pdf_text)
+api_db = get_vectorstore(text_chunks)
+

# Define the PDF retrieval function
def pdf_retrieval(query):
    # Run the query through the retriever
    response = api_db.similarity_search(query)
+    print(response)
    return response

# Create Gradio interface for the API retriever
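
The hunk ends at the comment above, so the Gradio wiring itself sits outside the diff. A minimal sketch of what that interface could look like, assuming a plain gr.Interface around pdf_retrieval (the variable name, labels, and widget choices here are illustrative, not the file's actual code):

# Hypothetical wiring, not part of the commit; assumes `import gradio as gr` at the top of app.py.
api_tool = gr.Interface(
    fn=pdf_retrieval,                  # runs the query against the FAISS store built above
    inputs=gr.Textbox(label="Query"),  # free-text question about the indexed PDF
    outputs="text",                    # matching documents, shown as plain text
    title="PDF Retriever",
)
api_tool.launch()

Since pdf_retrieval returns a list of Document objects, a more polished interface would likely join their page_content strings before display instead of relying on the default repr.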
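
One caveat when reading the hunk in isolation: it starts at line 7, so the import block above it is not shown (only from langchain.document_loaders import PyPDFLoader is visible in the hunk header). Judging by the names used in the new code, the file presumably carries imports roughly like the following; the exact module paths are a guess and depend on the installed pypdf and langchain versions:

# Assumed import block (above the hunk, not shown in the diff).
from dotenv import load_dotenv                             # for the now commented-out load_dotenv()
from pypdf import PdfReader                                # or: from PyPDF2 import PdfReader
from langchain.embeddings import HuggingFaceBgeEmbeddings  # BGE embedding wrapper used in get_vectorstore
from langchain.vectorstores import FAISS                   # FAISS.from_texts used above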