Jalajk committed on
Commit
50317d2
1 Parent(s): c83f68f

Upload Index.py

Browse files
Files changed (1) hide show
  1. Index.py +236 -0
Index.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+
3
+ # from transformers import pipeline
4
+ from txtai.embeddings import Embeddings
5
+ from txtai.pipeline import Extractor
6
+ from langchain.document_loaders import WebBaseLoader
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+
9
+ from langchain import HuggingFaceHub
10
+ from langchain.prompts import PromptTemplate
11
+ from langchain.chains import LLMChain
12
+ from txtai.embeddings import Embeddings
13
+ from txtai.pipeline import Extractor
14
+
15
+ import pandas as pd
16
+ import sqlite3
17
+ import os
18
+
19
# NOTE: docs_url is set to "/" so the interactive docs are served at the root
# path of the app. This way the docs act as the landing page for the app on
# Spaces.
app = FastAPI(docs_url="/")
22
+ # app = FastAPI()
23
+
24
+ # pipe = pipeline("text2text-generation", model="google/flan-t5-small")
25
+
26
+
27
+ # @app.get("/generate")
28
+ # def generate(text: str):
29
+ # """
30
+ # Using the text2text-generation pipeline from `transformers`, generate text
31
+ # from the given input text. The model used is `google/flan-t5-small`, which
32
+ # can be found [here](https://huggingface.co/google/flan-t5-small).
33
+ # """
34
+ # output = pipe(text)
35
+ # return {"output": output[0]["generated_text"]}
36
+
37
+
38
def load_embeddings(
    domain: str = "",
    db_present: bool = True,
    path: str = "sentence-transformers/all-MiniLM-L6-v2",
    index_name: str = "index",
):
    """Create an ``Embeddings`` instance and, if a saved index exists, load it.

    Args:
        domain: Sub-directory of ``index_name`` holding the saved index. An
            empty string means the index lives directly in ``index_name``.
        db_present: Whether a saved index exists. When false, nothing is
            loaded and the fresh instance is returned.
        path: Model path placed in the embeddings configuration.
        index_name: Root directory of the saved index(es).

    Returns:
        The ``Embeddings`` instance, with the saved index loaded when
        ``db_present`` is true.
    """
    # Embeddings model with content support enabled.
    model = Embeddings({"path": path, "content": True})

    if db_present:
        if domain != "":
            print(3)
            model.load(f"{index_name}/{domain}")
        else:
            model.load(index_name)  # change this later

    return model
57
+
58
+
59
+ def _check_if_db_exists(db_path: str) -> bool:
60
+ return os.path.exists(db_path)
61
+
62
+
63
def _text_splitter(doc):
    """Split ``doc`` with a ``RecursiveCharacterTextSplitter``.

    The splitter is configured with a chunk size of 500 and an overlap of 50,
    both measured with ``len``. Returns whatever
    ``transform_documents`` produces for ``doc``.
    """
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 50,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.transform_documents(doc)
    return chunks
70
+
71
+
72
def _load_docs(path: str):
    """Load ``path`` through ``WebBaseLoader`` and return the split chunks."""
    return _text_splitter(WebBaseLoader(path).load())
76
+
77
+
78
+ def _stream(dataset, limit, index: int = 0):
79
+ for row in dataset:
80
+ yield (index, row.page_content, None)
81
+ index += 1
82
+
83
+ if index >= limit:
84
+ break
85
+
86
+
87
+ def _max_index_id(path):
88
+ db = sqlite3.connect(path)
89
+
90
+ table = "sections"
91
+ df = pd.read_sql_query(f"select * from {table}", db)
92
+ return {"max_index": df["indexid"].max()}
93
+
94
+
95
def _upsert_docs(doc, embeddings, vector_doc_path: str, db_present: bool):
    """Add ``doc`` chunks to the embeddings index and save it.

    Args:
        doc: Iterable of chunks exposing ``page_content``.
        embeddings: Embeddings instance to write into.
        vector_doc_path: Directory the index is saved to; its ``documents``
            file is queried for the current maximum index id.
        db_present: True to upsert into an existing index, False to build a
            new index from scratch.

    Returns:
        The ``{"max_index": ...}`` mapping read from the saved database
        after the write, so it reflects the new documents.
    """
    print(vector_doc_path)
    documents_db = f"{vector_doc_path}/documents"

    if db_present:
        print(1)
        max_index = _max_index_id(documents_db)
        print(max_index)
        last_id = max_index["max_index"]
        # Start one past the current maximum so the last stored row is not
        # overwritten. None or NaN (NaN != NaN) means the table is empty.
        if last_id is None or last_id != last_id:
            next_id = 0
        else:
            next_id = int(last_id) + 1
        embeddings.upsert(_stream(doc, 500, next_id))
        print("Embeddings done!!")
        embeddings.save(vector_doc_path)
        print("Embeddings done - 1!!")
    else:
        print(2)
        embeddings.index(_stream(doc, 500, 0))
        embeddings.save(vector_doc_path)

    # Re-read after saving so the returned value is not stale.
    max_index = _max_index_id(documents_db)
    print(max_index)
    return max_index
115
+
116
+
117
+ # def prompt(question):
118
+ # return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
119
+ # Question: {question}
120
+ # Context: """
121
+
122
+
123
+ # def search(query, question=None):
124
+ # # Default question to query if empty
125
+ # if not question:
126
+ # question = query
127
+
128
+ # return extractor([("answer", query, prompt(question), False)])[0][1]
129
+
130
+
131
+ # @app.get("/rag")
132
+ # def rag(question: str):
133
+ # # question = "what is the document about?"
134
+ # answer = search(question)
135
+ # # print(question, answer)
136
+ # return {answer}
137
+
138
+
139
+ # @app.get("/index")
140
+ # def get_url_file_path(url_path: str):
141
+ # embeddings = load_embeddings()
142
+ # doc = _load_docs(url_path)
143
+ # embeddings, max_index = _upsert_docs(doc, embeddings)
144
+ # return max_index
145
+
146
+
147
@app.get("/index/{domain}/")
def get_domain_file_path(domain: str, file_path: str):
    """Index the document at ``file_path`` into the index for ``domain``.

    Args:
        domain: Name of the index sub-directory under ``index/``.
        file_path: Location handed to the document loader.

    Returns:
        The literal string ``"Executed Successfully!!"``.
    """
    print(domain, file_path)
    print(os.getcwd())
    # Build the path portably. The previous hard-coded backslashes only formed
    # a real path on Windows, so elsewhere the existing index was never found.
    # NOTE(review): ``domain`` comes from the request and is used in a
    # filesystem path; consider validating it against path traversal.
    db_path = os.path.join(os.getcwd(), "index", domain, "documents")
    bool_value = _check_if_db_exists(db_path=db_path)
    print(bool_value)

    embeddings = load_embeddings(domain=domain, db_present=bool_value)
    if bool_value:
        print(embeddings)
    doc = _load_docs(file_path)
    _upsert_docs(
        doc=doc,
        embeddings=embeddings,
        vector_doc_path=f"index/{domain}",
        db_present=bool_value,
    )
    return "Executed Successfully!!"
175
+
176
+
177
+ def _check_if_db_exists(db_path: str) -> bool:
178
+ return os.path.exists(db_path)
179
+
180
+
181
def _load_embeddings_from_db(
    db_present: bool,
    domain: str,
    path: str = "sentence-transformers/all-MiniLM-L6-v2",
):
    """Create an ``Embeddings`` instance, loading the saved index if present.

    Thin wrapper around ``load_embeddings`` that uses its default ``"index"``
    root directory. It replaces a duplicated copy of that function's logic.

    Args:
        db_present: Whether a saved index exists. When false, a fresh
            instance is returned without loading anything.
        domain: Sub-directory of ``index`` to load. An empty string loads
            ``index`` itself.
        path: Model path placed in the embeddings configuration.

    Returns:
        The ``Embeddings`` instance.
    """
    return load_embeddings(domain=domain, db_present=db_present, path=path)
198
+
199
+
200
+ def _prompt(question):
201
+ return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
202
+ Question: {question}
203
+ Context: """
204
+
205
+
206
def _search(query, extractor, question=None):
    """Run ``extractor`` for ``query`` and return the answer.

    Args:
        query: Text passed to the extractor as the query.
        extractor: Callable taking a list of
            ``(name, query, question, snippet)`` tuples.
        question: Question embedded in the prompt; falls back to ``query``
            when empty or omitted.

    Returns:
        The second element of the first result returned by ``extractor``.
    """
    effective_question = question or query
    task = ("answer", query, _prompt(effective_question), False)
    results = extractor([task])
    first_result = results[0]
    return first_result[1]
220
+
221
+
222
@app.get("/rag")
def rag(domain: str, question: str):
    """Answer ``question`` using the index stored for ``domain``.

    Args:
        domain: Name of the index sub-directory under ``index/``.
        question: Question to answer.

    Returns:
        A dict with the original ``question`` and the extracted ``answer``.
    """
    # Build the path portably. The previous hard-coded backslashes only formed
    # a real path on Windows, so elsewhere the saved index was never loaded.
    db_path = os.path.join(os.getcwd(), "index", domain, "documents")
    db_exists = _check_if_db_exists(db_path=db_path)
    print(db_exists)

    embeddings = _load_embeddings_from_db(db_exists, domain)
    # NOTE(review): the extractor is rebuilt on every request; consider
    # caching it if model start-up cost matters.
    extractor = Extractor(embeddings, "google/flan-t5-base")
    answer = _search(question, extractor)
    return {"question": question, "answer": answer}