BhanuPrakashSamoju committed on
Commit
1a0aea8
1 Parent(s): e44087a

Adding the Text Generator

Files changed (6)
  1. Dockerfile +27 -0
  2. app.py +20 -0
  3. extractor.py +94 -0
  4. index.py +168 -0
  5. main.py +85 -0
  6. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ # Use the official Python 3.9 image
+ FROM python:3.9
+
+ # Set the working directory to /code
+ WORKDIR /code
+
+ # Copy the requirements file into the container at /code
+ COPY ./requirements.txt /code/requirements.txt
+
+ # Install the dependencies listed in requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["uvicorn", "index:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,20 @@
+ from fastapi import FastAPI
+ from transformers import pipeline
+
+
+ # NOTE - we configure docs_url to serve the interactive Docs at the root path
+ # of the app. This way, we can use the docs as a landing page for the app on Spaces.
+ app = FastAPI(docs_url="/")
+
+ pipe = pipeline("text2text-generation", model="google/flan-t5-small")
+
+
+ @app.get("/generate")
+ def generate(text: str):
+     """
+     Using the text2text-generation pipeline from `transformers`, generate text
+     from the given input text. The model used is `google/flan-t5-small`, which
+     can be found [here](https://huggingface.co/google/flan-t5-small).
+     """
+     output = pipe(text)
+     return {"output": output[0]["generated_text"]}
extractor.py ADDED
@@ -0,0 +1,94 @@
+ from fastapi import FastAPI
+
+ # from transformers import pipeline
+ from txtai.embeddings import Embeddings
+ from txtai.pipeline import Extractor
+ from langchain.document_loaders import WebBaseLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ # Needed by _max_index_id below
+ import pandas as pd
+ import sqlite3
+
+ # NOTE - we configure docs_url to serve the interactive Docs at the root path
+ # of the app. This way, we can use the docs as a landing page for the app on Spaces.
+ app = FastAPI(docs_url="/")
+
+ # Create embeddings model with content support
+ embeddings = Embeddings(
+     {"path": "sentence-transformers/all-MiniLM-L6-v2", "content": True}
+ )
+
+
+ # Create extractor instance
+ # extractor = Extractor(embeddings, "google/flan-t5-base")
+
+
+ def _stream(dataset, limit, index: int = 0):
+     # Yield (id, text, tags) tuples for txtai, up to `limit` rows
+     for row in dataset:
+         yield (index, row.page_content, None)
+         index += 1
+
+         if index >= limit:
+             break
+
+
+ def _max_index_id(path):
+     # Read the highest index id already stored in the txtai SQLite database
+     db = sqlite3.connect(path)
+
+     table = "sections"
+     df = pd.read_sql_query(f"select * from {table}", db)
+     return {"max_index": df["indexid"].max()}
+
+
+ def _prompt(question):
+     return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
+     Question: {question}
+     Context: """
+
+
+ async def _search(query, extractor, question=None):
+     # Default question to query if empty
+     if not question:
+         question = query
+
+     return extractor([("answer", query, _prompt(question), False)])[0][1]
+
+
+ def _text_splitter(doc):
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=50,
+         length_function=len,
+     )
+     return text_splitter.transform_documents(doc)
+
+
+ def _load_docs(path: str):
+     load_doc = WebBaseLoader(path).load()
+     doc = _text_splitter(load_doc)
+     return doc
+
+
+ async def _upsert_docs(doc):
+     # NOTE: upserting starts at the current max id, so the last stored row is
+     # overwritten; max_index + 1 may be intended here.
+     max_index = _max_index_id("index/documents")
+     embeddings.upsert(_stream(doc, 500, max_index["max_index"]))
+     embeddings.save("index")
+
+     return embeddings
+
+
+ @app.put("/rag/{path}")
+ async def get_doc_path(path: str):
+     return path
+
+
+ @app.get("/rag")
+ async def rag(question: str, path: str):
+     # `path` was previously read before assignment; it is now taken as a
+     # query parameter holding the URL of the document to index.
+     embeddings.load("index")
+     doc = _load_docs(path)
+     await _upsert_docs(doc)
+
+     # Create extractor instance
+     extractor = Extractor(embeddings, "google/flan-t5-base")
+     answer = await _search(question, extractor)
+     # print(question, answer)
+     return {"answer": answer}
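
With this module served (e.g. `uvicorn extractor:app`), the /rag endpoint indexes a page and answers against it in one call. A usage sketch, assuming an arbitrary example URL and question:

import requests

params = {
    "question": "What is the document about?",
    "path": "https://example.com/article.html",  # hypothetical page to index
}
resp = requests.get("http://localhost:7860/rag", params=params)
print(resp.json())
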
index.py ADDED
@@ -0,0 +1,168 @@
+ from fastapi import FastAPI
+
+ # from transformers import pipeline
+ from txtai.embeddings import Embeddings
+ from txtai.pipeline import Extractor
+ from langchain.document_loaders import WebBaseLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ import pandas as pd
+ import sqlite3
+ import os
+
+ # NOTE - we configure docs_url to serve the interactive Docs at the root path
+ # of the app. This way, we can use the docs as a landing page for the app on Spaces.
+ app = FastAPI(docs_url="/")
+ # app = FastAPI()
+
+ # pipe = pipeline("text2text-generation", model="google/flan-t5-small")
+
+
+ # @app.get("/generate")
+ # def generate(text: str):
+ #     """
+ #     Using the text2text-generation pipeline from `transformers`, generate text
+ #     from the given input text. The model used is `google/flan-t5-small`, which
+ #     can be found [here](https://huggingface.co/google/flan-t5-small).
+ #     """
+ #     output = pipe(text)
+ #     return {"output": output[0]["generated_text"]}
+
+
+ def load_embeddings(
+     domain: str = "",
+     db_present: bool = True,
+     path: str = "sentence-transformers/all-MiniLM-L6-v2",
+     index_name: str = "index",
+ ):
+     # Create embeddings model with content support
+     embeddings = Embeddings({"path": path, "content": True})
+
+     # If no vector DB is present, return the bare embeddings model
+     if not db_present:
+         return embeddings
+     if domain == "":
+         embeddings.load(index_name)  # change this later
+     else:
+         print(f"Loading index for domain: {domain}")
+         embeddings.load(f"{index_name}/{domain}")
+     return embeddings
+
+
+ def _check_if_db_exists(db_path: str) -> bool:
+     return os.path.exists(db_path)
+
+
+ def _text_splitter(doc):
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=500,
+         chunk_overlap=50,
+         length_function=len,
+     )
+     return text_splitter.transform_documents(doc)
+
+
+ def _load_docs(path: str):
+     load_doc = WebBaseLoader(path).load()
+     doc = _text_splitter(load_doc)
+     return doc
+
+
+ def _stream(dataset, limit, index: int = 0):
+     # Yield (id, text, tags) tuples for txtai, up to `limit` rows
+     for row in dataset:
+         yield (index, row.page_content, None)
+         index += 1
+
+         if index >= limit:
+             break
+
+
+ def _max_index_id(path):
+     # Read the highest index id already stored in the txtai SQLite database
+     db = sqlite3.connect(path)
+
+     table = "sections"
+     df = pd.read_sql_query(f"select * from {table}", db)
+     return {"max_index": df["indexid"].max()}
+
+
+ def _upsert_docs(doc, embeddings, vector_doc_path: str, db_present: bool):
+     print(vector_doc_path)
+     if db_present:
+         # Existing index: continue numbering from the current max index id.
+         # NOTE: starting at the max id overwrites the last stored row;
+         # max_index + 1 may be intended here.
+         max_index = _max_index_id(f"{vector_doc_path}/documents")
+         print(max_index)
+         embeddings.upsert(_stream(doc, 500, max_index["max_index"]))
+         print("Embeddings done!!")
+         embeddings.save(vector_doc_path)
+         print("Embeddings saved!!")
+     else:
+         # No index yet: build a fresh one starting at id 0
+         embeddings.index(_stream(doc, 500, 0))
+         embeddings.save(vector_doc_path)
+         max_index = _max_index_id(f"{vector_doc_path}/documents")
+         print(max_index)
+     # check
+     # max_index = _max_index_id(f"{vector_doc_path}/documents")
+     # print(max_index)
+     return max_index
+
+
+ # def prompt(question):
+ #     return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
+ #     Question: {question}
+ #     Context: """
+
+
+ # def search(query, question=None):
+ #     # Default question to query if empty
+ #     if not question:
+ #         question = query
+
+ #     return extractor([("answer", query, prompt(question), False)])[0][1]
+
+
+ # @app.get("/rag")
+ # def rag(question: str):
+ #     # question = "what is the document about?"
+ #     answer = search(question)
+ #     # print(question, answer)
+ #     return {answer}
+
+
+ # @app.get("/index")
+ # def get_url_file_path(url_path: str):
+ #     embeddings = load_embeddings()
+ #     doc = _load_docs(url_path)
+ #     embeddings, max_index = _upsert_docs(doc, embeddings)
+ #     return max_index
+
+
+ @app.get("/index/{domain}/")
+ def get_domain_file_path(domain: str, file_path: str):
+     print(domain, file_path)
+     print(os.getcwd())
+     # Use os.path.join instead of hard-coded backslashes so the check also
+     # works outside Windows (the container runs Linux)
+     db_exists = _check_if_db_exists(
+         db_path=os.path.join(os.getcwd(), "index", domain, "documents")
+     )
+     print(db_exists)
+     # The original if/else branches were identical, so the existence check is
+     # simply passed through to load_embeddings and _upsert_docs.
+     embeddings = load_embeddings(domain=domain, db_present=db_exists)
+     doc = _load_docs(file_path)
+     max_index = _upsert_docs(
+         doc=doc,
+         embeddings=embeddings,
+         vector_doc_path=f"index/{domain}",
+         db_present=db_exists,
+     )
+     # print("Final - output : ", max_index)
+     return "Executed Successfully!!"
main.py ADDED
@@ -0,0 +1,85 @@
+ from fastapi import FastAPI
+ from txtai.embeddings import Embeddings
+ from txtai.pipeline import Extractor
+ import os
+
+ # Only used by the commented-out LLMChain variant below
+ from langchain import HuggingFaceHub
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+
+ # from transformers import pipeline
+
+ # NOTE - we configure docs_url to serve the interactive Docs at the root path
+ # of the app. This way, we can use the docs as a landing page for the app on Spaces.
+ app = FastAPI(docs_url="/")
+
+ # @app.get("/generate")
+ # def generate(text: str):
+ #     """
+ #     Using the text2text-generation pipeline from `transformers`, generate text
+ #     from the given input text. The model used is `google/flan-t5-small`, which
+ #     can be found [here](https://huggingface.co/google/flan-t5-small).
+ #     """
+ #     output = pipe(text)
+ #     return {"output": output[0]["generated_text"]}
+
+
+ def _check_if_db_exists(db_path: str) -> bool:
+     return os.path.exists(db_path)
+
+
+ def _load_embeddings_from_db(
+     db_present: bool,
+     domain: str,
+     path: str = "sentence-transformers/all-MiniLM-L6-v2",
+ ):
+     # Create embeddings model with content support
+     embeddings = Embeddings({"path": path, "content": True})
+     # If no vector DB is present, return the bare embeddings model
+     if not db_present:
+         return embeddings
+     if domain == "":
+         embeddings.load("index")  # change this later
+     else:
+         print(f"Loading index for domain: {domain}")
+         embeddings.load(f"index/{domain}")
+     return embeddings
+
+
+ def _prompt(question):
+     return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
+     Question: {question}
+     Context: """
+
+
+ def _search(query, extractor, question=None):
+     # Default question to query if empty
+     if not question:
+         question = query
+
+     # template = f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
+     # Question: {question}
+     # Context: """
+
+     # prompt = PromptTemplate(template=template, input_variables=["question"])
+     # llm_chain = LLMChain(prompt=prompt, llm=extractor)
+
+     # return {"question": question, "answer": llm_chain.run(question)}
+     return extractor([("answer", query, _prompt(question), False)])[0][1]
+
+
+ @app.get("/rag")
+ def rag(domain: str, question: str):
+     # Use os.path.join instead of hard-coded backslashes so the check also
+     # works outside Windows (the container runs Linux)
+     db_exists = _check_if_db_exists(
+         db_path=os.path.join(os.getcwd(), "index", domain, "documents")
+     )
+     print(db_exists)
+     # if db_exists:
+     embeddings = _load_embeddings_from_db(db_exists, domain)
+     # Create extractor instance
+     extractor = Extractor(embeddings, "google/flan-t5-base")
+     # llm = HuggingFaceHub(
+     #     repo_id="google/flan-t5-xxl",
+     #     model_kwargs={"temperature": 1, "max_length": 1000000},
+     # )
+     # else:
+     answer = _search(question, extractor)
+     return {"question": question, "answer": answer}
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi==0.74.*
+ requests==2.27.*
+ uvicorn[standard]==0.17.*
+ sentencepiece==0.1.*
+ torch==1.11.*
+ transformers==4.*
+ txtai==6.0.*
+ langchain==0.0.301
+ langsmith==0.0.40
+ bs4==0.0.1
+ pandas==2.1.1
+ SQLAlchemy==2.0.21