ankurmondal committed on
Commit 04f9592
Parent: 96b85d7

Upload 4 files

Files changed (4)
  1. Dockerfile +1 -1
  2. index.py +168 -0
  3. main.py +42 -22
  4. requirements.txt +6 -1
Dockerfile CHANGED
@@ -24,4 +24,4 @@ WORKDIR $HOME/app
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
 
-CMD ["uvicorn", "extractor:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "index:app", "--host", "0.0.0.0", "--port", "7860"]
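For quick local testing outside the container, a minimal sketch of an equivalent entry point (a hypothetical helper, not part of this commit; it assumes uvicorn is installed and that index.py sits in the working directory):

    # run_local.py - hypothetical local runner
    import uvicorn

    if __name__ == "__main__":
        # Serve the FastAPI app from index.py on the same port the Dockerfile CMD uses
        uvicorn.run("index:app", host="0.0.0.0", port=7860)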
index.py ADDED
@@ -0,0 +1,168 @@
+from fastapi import FastAPI
+
+# from transformers import pipeline
+from txtai.embeddings import Embeddings
+from txtai.pipeline import Extractor
+from langchain.document_loaders import WebBaseLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+import pandas as pd
+import sqlite3
+import os
+
+# NOTE - we configure docs_url to serve the interactive Docs at the root path
+# of the app. This way, we can use the docs as a landing page for the app on Spaces.
+app = FastAPI(docs_url="/")
+
+# pipe = pipeline("text2text-generation", model="google/flan-t5-small")
+
+
+# @app.get("/generate")
+# def generate(text: str):
+#     """
+#     Using the text2text-generation pipeline from `transformers`, generate text
+#     from the given input text. The model used is `google/flan-t5-small`, which
+#     can be found [here](https://huggingface.co/google/flan-t5-small).
+#     """
+#     output = pipe(text)
+#     return {"output": output[0]["generated_text"]}
+
+
+def load_embeddings(
+    domain: str = "",
+    db_present: bool = True,
+    path: str = "sentence-transformers/all-MiniLM-L6-v2",
+    index_name: str = "index",
+):
+    # Create embeddings model with content support
+    embeddings = Embeddings({"path": path, "content": True})
+
+    # If no vector DB exists yet, return a fresh (empty) embeddings instance
+    if not db_present:
+        return embeddings
+    if domain == "":
+        embeddings.load(index_name)  # change this later
+    else:
+        embeddings.load(f"{index_name}/{domain}")
+    return embeddings
+
+
+def _check_if_db_exists(db_path: str) -> bool:
+    return os.path.exists(db_path)
+
+
+def _text_splitter(doc):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+        length_function=len,
+    )
+    return text_splitter.transform_documents(doc)
+
+
+def _load_docs(path: str):
+    load_doc = WebBaseLoader(path).load()
+    doc = _text_splitter(load_doc)
+    return doc
+
+
+def _stream(dataset, limit, index: int = 0):
+    # Yield (id, text, tags) tuples for txtai until the running id reaches `limit`
+    for row in dataset:
+        yield (index, row.page_content, None)
+        index += 1
+
+        if index >= limit:
+            break
+
+
+def _max_index_id(path):
+    # Read the highest indexid from the "sections" table txtai stores in SQLite
+    db = sqlite3.connect(path)
+    table = "sections"
+    df = pd.read_sql_query(f"select * from {table}", db)
+    return {"max_index": df["indexid"].max()}
+
+
+def _upsert_docs(doc, embeddings, vector_doc_path: str, db_present: bool):
+    if db_present:
+        # Resume ids after the current maximum so the upsert appends new rows
+        # instead of overwriting the last stored one
+        max_index = _max_index_id(os.path.join(vector_doc_path, "documents"))
+        embeddings.upsert(_stream(doc, 500, max_index["max_index"] + 1))
+        embeddings.save(vector_doc_path)
+    else:
+        embeddings.index(_stream(doc, 500, 0))
+        embeddings.save(vector_doc_path)
+        max_index = _max_index_id(os.path.join(vector_doc_path, "documents"))
+    return max_index
+
+
+# def prompt(question):
+#     return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
+#     Question: {question}
+#     Context: """
+
+
+# def search(query, question=None):
+#     # Default question to query if empty
+#     if not question:
+#         question = query
+
+#     return extractor([("answer", query, prompt(question), False)])[0][1]
+
+
+# @app.get("/rag")
+# def rag(question: str):
+#     answer = search(question)
+#     return {answer}
+
+
+# @app.get("/index")
+# def get_url_file_path(url_path: str):
+#     embeddings = load_embeddings()
+#     doc = _load_docs(url_path)
+#     embeddings, max_index = _upsert_docs(doc, embeddings)
+#     return max_index
+
+
+@app.get("/index/{domain}/")
+def get_domain_file_path(domain: str, file_path: str):
+    # Build the path portably; the container runs on Linux, so avoid backslashes
+    db_exists = _check_if_db_exists(
+        db_path=os.path.join(os.getcwd(), "index", domain, "documents")
+    )
+    # Load the existing index for this domain, or start a fresh one; both
+    # cases then share the same upsert path
+    embeddings = load_embeddings(domain=domain, db_present=db_exists)
+    doc = _load_docs(file_path)
+    _upsert_docs(
+        doc=doc,
+        embeddings=embeddings,
+        vector_doc_path=f"index/{domain}",
+        db_present=db_exists,
+    )
+    return "Executed Successfully!!"
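A minimal client sketch for the new indexing endpoint (the base URL, domain name, and target page are illustrative assumptions, not part of this commit):

    import requests

    # Hypothetical call: index a web page under the "docs" domain.
    # Assumes the app is running locally on port 7860 (see the Dockerfile CMD).
    resp = requests.get(
        "http://localhost:7860/index/docs/",
        params={"file_path": "https://example.com/some-page"},
    )
    print(resp.status_code, resp.json())  # expected body: "Executed Successfully!!"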
main.py CHANGED
@@ -1,22 +1,14 @@
 from fastapi import FastAPI
-# from transformers import pipeline
 from txtai.embeddings import Embeddings
 from txtai.pipeline import Extractor
+import os
+
+# from transformers import pipeline
 
 # NOTE - we configure docs_url to serve the interactive Docs at the root path
 # of the app. This way, we can use the docs as a landing page for the app on Spaces.
 app = FastAPI(docs_url="/")
 
-# Create embeddings model with content support
-embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2", "content": True})
-embeddings.load('index')
-
-# Create extractor instance
-extractor = Extractor(embeddings, "google/flan-t5-base")
-
-# pipe = pipeline("text2text-generation", model="google/flan-t5-small")
-
-
 # @app.get("/generate")
 # def generate(text: str):
 #     """
@@ -28,23 +20,51 @@
 #     return {"output": output[0]["generated_text"]}
 
 
-def prompt(question):
-    return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
+def _check_if_db_exists(db_path: str) -> bool:
+    return os.path.exists(db_path)
+
+
+def _load_embeddings_from_db(
+    db_present: bool,
+    domain: str,
+    path: str = "sentence-transformers/all-MiniLM-L6-v2",
+):
+    # Create embeddings model with content support
+    embeddings = Embeddings({"path": path, "content": True})
+    # If no vector DB exists yet, return a fresh (empty) embeddings instance
+    if not db_present:
+        return embeddings
+    if domain == "":
+        embeddings.load("index")  # change this later
+    else:
+        embeddings.load(f"index/{domain}")
+    return embeddings
+
+
+def _prompt(question):
+    return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
 Question: {question}
 Context: """
 
 
-def search(query, question=None):
+def _search(query, extractor, question=None):
     # Default question to query if empty
     if not question:
         question = query
 
-    return extractor([("answer", query, prompt(question), False)])[0][1]
+    return extractor([("answer", query, _prompt(question), False)])[0][1]
 
 
 @app.get("/rag")
-def rag(question: str):
-    # question = "what is the document about?"
-    answer = search(question)
-    return {answer}
+def rag(domain: str, question: str):
+    # Build the path portably; the container runs on Linux, so avoid backslashes
+    db_exists = _check_if_db_exists(
+        db_path=os.path.join(os.getcwd(), "index", domain, "documents")
+    )
+    embeddings = _load_embeddings_from_db(db_exists, domain)
+    # Create extractor instance over the loaded (or empty) embeddings
+    extractor = Extractor(embeddings, "google/flan-t5-base")
+    answer = _search(question, extractor)
+    return {"answer": answer}
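And a matching client sketch for the reworked /rag endpoint (the domain and question values are illustrative; the endpoint assumes the domain was indexed via index.py first):

    import requests

    # Hypothetical query against a previously indexed domain.
    resp = requests.get(
        "http://localhost:7860/rag",
        params={"domain": "docs", "question": "What is this document about?"},
    )
    print(resp.json())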
requirements.txt CHANGED
@@ -2,6 +2,11 @@ fastapi==0.74.*
 requests==2.27.*
 uvicorn[standard]==0.17.*
 sentencepiece==0.1.*
+torch==1.11.*
 transformers==4.*
 txtai==6.0.*
-langchain==0.0.295
+langchain==0.0.301
+langsmith==0.0.40
+bs4==0.0.1
+pandas==2.1.1
+SQLAlchemy==2.0.21