DeepVen committed on
Commit cf24d02
1 Parent(s): 11dbfc3

Delete Venkat.py

Files changed (1)
  1. Venkat.py +0 -218
Venkat.py DELETED
@@ -1,218 +0,0 @@
from fastapi import FastAPI

# from transformers import pipeline
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor

from langchain import HuggingFaceHub  # used only by the commented-out LLMChain path below
from langchain.chains import LLMChain
from langchain.document_loaders import WebBaseLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pandas as pd
import sqlite3
import os

# NOTE - we configure docs_url to serve the interactive Docs at the root path
# of the app. This way, we can use the docs as a landing page for the app on Spaces.
app = FastAPI(docs_url="/")
# app = FastAPI()
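For context, a minimal sketch of how this module would be served locally; the launch command and port are assumptions (Spaces commonly expose 7860), not taken from this repo's config.

# Assumed launch command, not from this repo:
#   uvicorn Venkat:app --host 0.0.0.0 --port 7860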

# pipe = pipeline("text2text-generation", model="google/flan-t5-small")


# @app.get("/generate")
# def generate(text: str):
#     """
#     Using the text2text-generation pipeline from `transformers`, generate text
#     from the given input text. The model used is `google/flan-t5-small`, which
#     can be found [here](https://huggingface.co/google/flan-t5-small).
#     """
#     output = pipe(text)
#     return {"output": output[0]["generated_text"]}

def load_embeddings(
    domain: str = "",
    db_present: bool = True,
    path: str = "sentence-transformers/all-MiniLM-L6-v2",
    index_name: str = "index",
):
    # Create embeddings model with content support
    embeddings = Embeddings({"path": path, "content": True})

    # If the vector DB is not present yet, return the unloaded embeddings model
    if not db_present:
        return embeddings

    # Otherwise load the saved index: the default one, or the domain-specific one
    if domain == "":
        embeddings.load(index_name)  # TODO: make the default index configurable
    else:
        embeddings.load(f"{index_name}/{domain}")
    return embeddings
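A minimal usage sketch of the helper above; the "wiki" domain is hypothetical, and the SQL-style query relies on the index having been built with content=True as configured here.

# Hypothetical: load a previously saved index for an example "wiki" domain
embeddings = load_embeddings(domain="wiki", db_present=True)
print(embeddings.search("select id, text, score from txtai where similar('example query')", 3))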

def _check_if_db_exists(db_path: str) -> bool:
    return os.path.exists(db_path)

def _text_splitter(doc):
    # Split documents into 500-character chunks with a 50-character overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    return text_splitter.transform_documents(doc)

def _load_docs(path: str):
    # Fetch the page at the given URL and split it into chunks
    load_doc = WebBaseLoader(path).load()
    doc = _text_splitter(load_doc)
    return doc
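A quick sketch of what _load_docs returns; the URL is illustrative, not one used by this Space.

# Each chunk is a LangChain Document; _stream() below reads only page_content
chunks = _load_docs("https://en.wikipedia.org/wiki/Text_mining")
print(len(chunks), chunks[0].page_content[:80])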

def _stream(dataset, limit, index: int = 0):
    # Yield (id, text, tags) tuples in the format txtai expects for indexing
    for row in dataset:
        yield (index, row.page_content, None)
        index += 1

        # Note: limit caps the absolute id, not the number of rows yielded
        if index >= limit:
            break
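To illustrate the (id, text, tags) tuples txtai consumes, a self-contained sketch with a made-up stand-in for LangChain documents:

class _Row:
    """Stand-in for a LangChain Document with a page_content attribute."""
    def __init__(self, text):
        self.page_content = text

print(list(_stream([_Row("first chunk"), _Row("second chunk")], limit=500)))
# -> [(0, 'first chunk', None), (1, 'second chunk', None)]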

def _max_index_id(path):
    # Read the highest indexid in the txtai "sections" table so new rows can
    # be appended after it
    db = sqlite3.connect(path)

    table = "sections"
    df = pd.read_sql_query(f"select * from {table}", db)
    db.close()
    return {"max_index": df["indexid"].max()}

def _upsert_docs(doc, embeddings, vector_doc_path: str, db_present: bool):
    print(vector_doc_path)
    if db_present:
        # Append to the existing index, starting ids after the current max so
        # the last existing row is not overwritten
        max_index = _max_index_id(f"{vector_doc_path}/documents")
        print(max_index)
        embeddings.upsert(_stream(doc, 500, max_index["max_index"] + 1))
        embeddings.save(vector_doc_path)
        print("Embeddings upserted and saved!")
    else:
        # No index yet: build one from scratch starting at id 0
        embeddings.index(_stream(doc, 500, 0))
        embeddings.save(vector_doc_path)
        max_index = _max_index_id(f"{vector_doc_path}/documents")
        print(max_index)
    return max_index

@app.get("/index/{domain}/")
def get_domain_file_path(domain: str, file_path: str):
    print(domain, file_path)
    print(os.getcwd())

    db_exists = _check_if_db_exists(db_path=f"{os.getcwd()}/index/{domain}/documents")
    print(db_exists)

    # load_embeddings and _upsert_docs handle both the new-index and
    # existing-index cases via the db_present flag
    embeddings = load_embeddings(domain=domain, db_present=db_exists)
    doc = _load_docs(file_path)
    max_index = _upsert_docs(
        doc=doc,
        embeddings=embeddings,
        vector_doc_path=f"{os.getcwd()}/index/{domain}",
        db_present=db_exists,
    )
    # print("Final - output : ", max_index)
    return "Executed Successfully!!"
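A hedged usage sketch for this endpoint; the base URL and the page being indexed are examples only.

import requests

resp = requests.get(
    "http://localhost:7860/index/wiki/",
    params={"file_path": "https://en.wikipedia.org/wiki/Text_mining"},
)
print(resp.json())  # "Executed Successfully!!"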

def _load_embeddings_from_db(
    db_present: bool,
    domain: str,
    path: str = "sentence-transformers/all-MiniLM-L6-v2",
):
    # Create embeddings model with content support
    embeddings = Embeddings({"path": path, "content": True})

    # If the vector DB is not present yet, return the unloaded embeddings model
    if not db_present:
        return embeddings

    # Otherwise load the saved index: the default one, or the domain-specific one
    if domain == "":
        embeddings.load("index")  # TODO: make the default index configurable
    else:
        embeddings.load(f"{os.getcwd()}/index/{domain}")
    return embeddings

def _prompt(question):
    # txtai's Extractor appends the retrieved context after this prompt text
    return f"""Answer the following question using only the context below. Say 'Could not find answer within the context' when the question can't be answered.
Question: {question}
Context: """

def _search(query, extractor, question=None):
    # Default question to query if empty
    if not question:
        question = query

    # Alternative LangChain path, kept for reference:
    # template = f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
    # Question: {question}
    # Context: """
    # prompt = PromptTemplate(template=template, input_variables=["question"])
    # llm_chain = LLMChain(prompt=prompt, llm=extractor)
    # return {"question": question, "answer": llm_chain.run(question)}

    # Each queue entry is (name, query, question, snippet); results are
    # (name, answer) tuples, so [0][1] is the answer text
    return extractor([("answer", query, _prompt(question), False)])[0][1]

@app.get("/rag")
def rag(domain: str, question: str):
    db_exists = _check_if_db_exists(db_path=f"{os.getcwd()}/index/{domain}/documents")
    print(db_exists)

    # Retrieve over the domain index and answer with a flan-t5-base reader
    embeddings = _load_embeddings_from_db(db_exists, domain)
    extractor = Extractor(similarity=embeddings, path="google/flan-t5-base")
    # llm = HuggingFaceHub(
    #     repo_id="google/flan-t5-xxl",
    #     model_kwargs={"temperature": 1, "max_length": 1000000},
    # )
    answer = _search(question, extractor)
    return {"question": question, "answer": answer}
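The matching query-side sketch for /rag; the base URL, domain, and question are again illustrative.

import requests

resp = requests.get(
    "http://localhost:7860/rag",
    params={"domain": "wiki", "question": "What is text mining?"},
)
print(resp.json())  # {"question": ..., "answer": ...}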

'''
TODO: load the embeddings and extractor models once during startup instead of per request
'''

# Create extractor instance
# extractor = Extractor(path="google/flan-t5-large")
# extractor = Extractor(embeddings, "TheBloke/Llama-2-7B-GGUF")
# extractor = Extractor(embeddings, "google/flan-t5-xl")
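One possible shape for the startup idea noted above, using FastAPI's on_event hook; the fixed "wiki" domain is a placeholder, and a real version would likely cache one extractor per domain.

# Hypothetical startup hook: build the extractor once instead of per request
@app.on_event("startup")
def _warm_up():
    global extractor
    embeddings = _load_embeddings_from_db(True, "wiki")  # placeholder domain
    extractor = Extractor(similarity=embeddings, path="google/flan-t5-base")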