abaliyan committed on
Commit
13d8e3b
1 Parent(s): bef7bef

Update utils.py

Files changed (1)
  1. utils.py +103 -11
utils.py CHANGED
@@ -1,17 +1,109 @@
 import streamlit as st
 import os

- def save_pdf_to_directory(uploaded_file, directory):
-     if uploaded_file is not None:
-         # Define directory to save file
-
-         if not os.path.exists(directory):
-             os.makedirs(directory)
-
-         # Save uploaded PDF file to directory
-         with open(os.path.join(directory, uploaded_file.name), "wb") as pdf_file:
-             pdf_file.write(uploaded_file.getbuffer())
-
-         st.success(f"File '{uploaded_file.name}' saved successfully!")
 import streamlit as st
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
+ from pypdf import PdfReader
+ from langchain.schema import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+ from pinecone import Pinecone as PineconeClient
+ from langchain.chains.question_answering import load_qa_chain
+ from datetime import datetime
+ from langchain_community.vectorstores import Pinecone
 import os
+ import time
+
+
+ def get_pdf_text(pdf_doc):
+     text = ""
+     pdf_reader = PdfReader(pdf_doc)
+     for page in pdf_reader.pages:
+         text += page.extract_text()
+     return text
+
+ def create_docs(user_pdf_list, unique_id):
+     docs = []
+     for filename in user_pdf_list:
+         chunks = get_pdf_text(filename)
+
+         docs.append(Document(
+             page_content=chunks,
+             metadata={"name": filename.name, "type": filename.type, "size": filename.size, "unique_id": unique_id, "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+         ))
+
+     return docs
+
+ # transform documents into smaller overlapping chunks
+ def split_docs(documents, chunk_size=400, chunk_overlap=20):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     docs = text_splitter.split_documents(documents)
+
+     return docs
+
+ def get_embeddings():
+     embedding = OpenAIEmbeddings()
+     return embedding
+
+
+ def push_to_pinecone(docs, embedding):
+
+     pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))
+     index_name = os.environ.get("PINECONE_INDEX_NAME")
+     index = pc.Index(index_name)
+
+     # clear any previously indexed data in this namespace
+     index.delete(delete_all=True, namespace='rag_bot')
+
+     vector = []
+     for i, doc in enumerate(docs):
+         entry = {"id": str(i),
+                  "values": embedding.embed_query(doc.page_content),
+                  "metadata": doc.metadata}
+         vector.append(entry)
+
+     index = Pinecone.from_documents(docs, embedding, index_name=index_name, namespace='rag_bot')
+
+     st.sidebar.write("This 30-second delay is added manually... \n(because I'm using some free resources)")
+     time.sleep(30)
+
+     return index
+
+
+ # Function to pull index data from Pinecone
+ def pull_from_pinecone(embeddings):
+
+     pinecone_apikey = os.environ.get("PINECONE_API_KEY")
+     pinecone_index_name = os.environ.get("PINECONE_INDEX_NAME")
+
+     PineconeClient(
+         api_key=pinecone_apikey
+     )
+
+     # "Pinecone" here is the LangChain vector store class imported at the top, not the Pinecone client
+     index = Pinecone.from_existing_index(pinecone_index_name, embeddings, namespace='rag_bot')
+
+     return index
+
+
+ def get_similar_doc(query, embedding, k=2):
+
+     pc = PineconeClient(api_key=os.environ.get("PINECONE_API_KEY"))
+     index_name = os.environ.get("PINECONE_INDEX_NAME")
+     index = pc.Index(index_name)
+
+     index = pull_from_pinecone(embeddings=embedding)
+     similar_doc = index.similarity_search_with_score(query, int(k))
+
+     return [doc for doc, similarity_score in similar_doc]
+
+
+ def get_answer(query, embedding, k=2):
+     llm = ChatOpenAI(temperature=0.5)
+     chain = load_qa_chain(llm, chain_type="stuff")
+
+     relevant_doc = get_similar_doc(query, embedding, k=k)
+     response = chain.run(input_documents=relevant_doc, question=query)
+     return response
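
For context, a minimal sketch of how these helpers could be wired together from a Streamlit front end. The app.py filename, the widget labels, and the uuid-based unique_id are assumptions for illustration, not part of this commit; OPENAI_API_KEY, PINECONE_API_KEY, and PINECONE_INDEX_NAME must be set in the environment for the OpenAI and Pinecone calls to succeed.

# app.py (hypothetical caller of the functions added in this commit)
import uuid
import streamlit as st
from utils import create_docs, split_docs, get_embeddings, push_to_pinecone, get_answer

st.title("PDF Q&A")

pdfs = st.sidebar.file_uploader("Upload PDFs", type="pdf", accept_multiple_files=True)

if st.sidebar.button("Index PDFs") and pdfs:
    unique_id = uuid.uuid4().hex
    docs = create_docs(pdfs, unique_id)    # one Document per uploaded PDF
    chunks = split_docs(docs)              # 400-character chunks with 20-character overlap
    embedding = get_embeddings()           # OpenAIEmbeddings
    push_to_pinecone(chunks, embedding)    # wipes and refills the 'rag_bot' namespace
    st.sidebar.success("PDFs indexed")

query = st.text_input("Ask a question about the uploaded PDFs")
if query:
    st.write(get_answer(query, get_embeddings(), k=2))    # stuff chain over the 2 nearest chunks

Note that push_to_pinecone deletes everything in the 'rag_bot' namespace before re-indexing, so each upload replaces the previously indexed corpus.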