Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -1,17 +1,109 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
-
def save_pdf_to_directory(uploaded_file, directory):
    """Persist an uploaded PDF into *directory*, creating it if needed.

    No-op when *uploaded_file* is None. The file is written under its own
    ``.name`` attribute, with bytes taken from ``.getbuffer()``.
    """
    if uploaded_file is None:
        return
    # Ensure the destination directory exists before writing.
    os.makedirs(directory, exist_ok=True)
    target_path = os.path.join(directory, uploaded_file.name)
    with open(target_path, "wb") as out_file:
        out_file.write(uploaded_file.getbuffer())
|
14 |
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
|
|
|
|
|
|
17 |
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
3 |
+
from pypdf import PdfReader
|
4 |
+
from langchain.schema import Document
|
5 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
+
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
7 |
+
from pinecone import Pinecone as PineconeClient
|
8 |
+
from langchain.chains.question_answering import load_qa_chain
|
9 |
+
from datetime import datetime
|
10 |
+
from langchain_community.vectorstores import Pinecone
|
11 |
import os
|
12 |
+
import time
|
13 |
+
|
14 |
+
|
15 |
+
def get_pdf_text(pdf_doc):
    """Return the concatenated extracted text of every page in *pdf_doc*."""
    reader = PdfReader(pdf_doc)
    return "".join(page.extract_text() for page in reader.pages)
|
21 |
+
|
22 |
+
def create_docs(user_pdf_list, unique_id):
    """Wrap each uploaded PDF in a langchain ``Document`` with upload metadata.

    Parameters
    ----------
    user_pdf_list : iterable of uploaded file objects (Streamlit
        UploadedFile-like: must expose ``.name``, ``.type``, ``.size``)
    unique_id : identifier stamped onto every document's metadata

    Returns
    -------
    list[Document]
        One Document per PDF, with its full extracted text as page_content.
    """
    docs = []
    for uploaded_pdf in user_pdf_list:
        text = get_pdf_text(uploaded_pdf)
        docs.append(
            Document(
                page_content=text,
                metadata={
                    "name": uploaded_pdf.name,
                    # BUG FIX: key was "type=" (trailing '=' typo), so any
                    # consumer looking up metadata["type"] would fail.
                    "type": uploaded_pdf.type,
                    "size": uploaded_pdf.size,
                    "unique_id": unique_id,
                    "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                },
            )
        )
    return docs
|
34 |
+
|
35 |
+
# transform documents
|
36 |
+
def split_docs(documents, chunk_size=400, chunk_overlap=20):
    """Split *documents* into overlapping chunks suitable for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
|
41 |
+
|
42 |
+
def get_embeddings():
    """Return a fresh OpenAIEmbeddings instance (default configuration)."""
    return OpenAIEmbeddings()
|
45 |
+
|
46 |
+
|
47 |
+
def push_to_pinecone(docs, embedding):
    """Replace the contents of the 'rag_bot' namespace with *docs*.

    Clears the namespace, then embeds and upserts the documents through
    langchain's ``Pinecone`` wrapper. Finishes with a deliberate 30s sleep
    (throttle for the free-tier resources, announced in the sidebar).

    Parameters
    ----------
    docs : list of langchain Documents to index
    embedding : embedding client used by ``from_documents``

    Returns the langchain Pinecone vector store.
    """
    pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))
    index_name = os.environ.get("PINECONE_INDEX_NAME")
    index = pc.Index(index_name)

    # Wipe the namespace so vectors from earlier uploads don't linger.
    index.delete(delete_all=True, namespace='rag_bot')

    # BUG FIX: the previous version also built a `vector` list by calling
    # embedding.embed_query() once per document, then never used it —
    # from_documents() embeds everything itself. Removing the dead loop
    # halves the embedding API calls.
    store = Pinecone.from_documents(docs, embedding, index_name=index_name, namespace='rag_bot')

    st.sidebar.write("This 30 seconds delay is added Manually... \n(because I'm using some free resources)")
    time.sleep(30)

    return store
|
69 |
+
|
70 |
+
|
71 |
+
|
72 |
+
#Function to pull index data from Pinecone
|
73 |
+
def pull_from_pinecone(embeddings):
    """Open the existing Pinecone index (namespace 'rag_bot') as a vector store."""
    api_key = os.environ.get("PINECONE_API_KEY")
    index_name = os.environ.get("PINECONE_INDEX_NAME")

    # NOTE(review): the client instance is created but not bound — presumably
    # instantiation alone configures the SDK; confirm before removing.
    PineconeClient(api_key=api_key)

    # `Pinecone` is langchain's vector-store wrapper (see imports at the top).
    return Pinecone.from_existing_index(index_name, embeddings, namespace='rag_bot')
|
86 |
+
|
87 |
+
|
88 |
+
|
89 |
+
|
90 |
+
def get_similar_doc(query, embedding, k=2):
    """Return the *k* documents most similar to *query* from the index.

    Similarity scores are discarded; only the Document objects are returned.
    """
    # BUG FIX: removed a dead PineconeClient / pc.Index setup whose result
    # was immediately overwritten by pull_from_pinecone() below.
    store = pull_from_pinecone(embeddings=embedding)
    scored_docs = store.similarity_search_with_score(query, int(k))
    return [doc for doc, _score in scored_docs]
|
100 |
+
|
101 |
+
|
102 |
|
103 |
+
def get_answer(query, embedding, k=2):
    """Answer *query* with a 'stuff' QA chain over the top-*k* similar docs.

    Parameters
    ----------
    query : question string
    embedding : embedding client used for similarity search
    k : number of documents to retrieve (default 2)
    """
    llm = ChatOpenAI(temperature=0.5)
    chain = load_qa_chain(llm, chain_type="stuff")

    # BUG FIX: previously passed the literal k=2, silently ignoring the
    # caller-supplied k.
    relevant_docs = get_similar_doc(query, embedding, k=k)
    return chain.run(input_documents=relevant_docs, question=query)
|