Chris4K committed on
Commit
b08204e
1 Parent(s): 6976271

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -18
app.py CHANGED
@@ -7,34 +7,94 @@ from langchain.document_loaders import PyPDFLoader
7
  from langchain.text_splitter import CharacterTextSplitter
8
 
9
  # Load environment variables
10
- load_dotenv()
11
 
12
- # Load and process the PDF files
13
- loader = PyPDFLoader("./new_papers/ALiBi.pdf")
14
- documents = loader.load()
15
 
16
- # Split the documents into chunks and embed them using HuggingFaceBgeEmbeddings
17
- text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
18
- vdocuments = text_splitter.split_documents(documents)
19
 
20
- # Extract the text from the Document objects
21
- docs_text = [doc.text for doc in vdocuments]
 
22
 
23
- model = "BAAI/bge-base-en-v1.5"
24
- encode_kwargs = {
25
- "normalize_embeddings": True
26
- } # set True to compute cosine similarity
27
- embeddings = HuggingFaceBgeEmbeddings(
28
- model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
29
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- # Create FAISS vector store for API embeddings
32
- api_db = FAISS.from_texts(texts=docs_text, embedding=embeddings)
33
 
34
  # Define the PDF retrieval function
35
  def pdf_retrieval(query):
36
  # Run the query through the retriever
37
  response = api_db.similarity_search(query)
 
38
  return response
39
 
40
  # Create Gradio interface for the API retriever
 
7
  from langchain.text_splitter import CharacterTextSplitter
8
 
9
  # Load environment variables
10
+ #load_dotenv()
11
 
 
 
 
12
 
 
 
 
13
 
14
def get_pdf_text(pdf_docs):
    """
    Extract text from one or more PDF documents.

    Parameters
    ----------
    pdf_docs : str, os.PathLike, or iterable
        Either a single PDF path or an iterable of PDF sources that are each
        handed to ``PdfReader``. A bare path is wrapped in a one-item list so
        that it is not iterated character by character.

    Returns
    -------
    str
        Text of every page of every document, concatenated in reading order.
        An empty iterable yields an empty string.

    """
    import os

    # A lone path is itself iterable (a str yields its characters), which
    # would feed "." , "/" ... to PdfReader. Normalise it to a list first.
    if isinstance(pdf_docs, (str, os.PathLike)):
        pdf_docs = [pdf_docs]

    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # NOTE(review): extract_text() is assumed to possibly return
            # None/"" for pages without a text layer - treat that as empty
            # instead of failing on str + None.
            page_texts.append(page.extract_text() or "")
    # Join once rather than growing a string with += inside the loop.
    return "".join(page_texts)
35
+
36
+
37
def get_text_chunks(text):
    """
    Break a long string into overlapping chunks for embedding.

    The text is split on newline characters with ``CharacterTextSplitter``,
    targeting chunks of 1500 characters (measured with ``len``) that overlap
    by 300 characters.

    Parameters
    ----------
    text : str
        The input text to be split.

    Returns
    -------
    list
        The chunks produced by ``CharacterTextSplitter.split_text``.

    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1500,
        "chunk_overlap": 300,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
57
+
58
+
59
def get_vectorstore(text_chunks):
    """
    Build a FAISS vector store from text chunks using HuggingFaceBgeEmbeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded.

    Returns
    -------
    FAISS
        A FAISS vector store containing the embeddings of the text chunks.

    Notes
    -----
    After the store is built, one hard-coded sample query is run and its
    result printed between ``-----`` markers as a start-up sanity check.

    """
    model = "BAAI/bge-base-en-v1.5"
    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
    )
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    # Sanity check. `as_retriever` is a method, so the previous
    # `vectorstore.as_retriever.similarity(...)` raised AttributeError before
    # the store could be returned; query the store directly instead.
    print("-----")
    print(vectorstore.similarity_search("What is ALiBi?"))
    print("-----")
    return vectorstore
86
+
87
# Build the module-level vector store from the ALiBi paper PDF.
# get_pdf_text iterates over its argument, so the path is passed inside a
# list; a bare string would be walked one character at a time.
pdf_text = get_pdf_text(["./new_papers/ALiBi.pdf"])
text_chunks = get_text_chunks(pdf_text)
api_db = get_vectorstore(text_chunks)
90
 
91
+
 
92
 
93
  # Define the PDF retrieval function
94
  def pdf_retrieval(query):
95
  # Run the query through the retriever
96
  response = api_db.similarity_search(query)
97
+ print(response)
98
  return response
99
 
100
  # Create Gradio interface for the API retriever