ankurmondal committed on
Commit
72debe9
1 Parent(s): 8f8d373

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. extractor.py +94 -0
  3. requirements.txt +4 -1
Dockerfile CHANGED
@@ -24,4 +24,4 @@ WORKDIR $HOME/app
24
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
25
  COPY --chown=user . $HOME/app
26
 
27
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
24
  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
25
  COPY --chown=user . $HOME/app
26
 
27
+ CMD ["uvicorn", "extractor:app", "--host", "0.0.0.0", "--port", "7860"]
extractor.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+
3
+ # from transformers import pipeline
4
+ from txtai.embeddings import Embeddings
5
+ from txtai.pipeline import Extractor
6
+ from langchain.document_loaders import WebBaseLoader
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+
9
# NOTE - we configure docs_url to serve the interactive Docs at the root path
# of the app. This way, we can use the docs as a landing page for the app on Spaces.
app = FastAPI(docs_url="/")

# Create embeddings model with content support
# ("content": True presumably enables the SQLite-backed content store that
# _max_index_id later queries under index/documents — confirm against txtai docs)
embeddings = Embeddings(
    {"path": "sentence-transformers/all-MiniLM-L6-v2", "content": True}
)
17
+
18
+
19
+ # Create extractor instance
20
+ # extractor = Extractor(embeddings, "google/flan-t5-base")
21
+
22
+
23
+ def _stream(dataset, limit, index: int = 0):
24
+ for row in dataset:
25
+ yield (index, row.page_content, None)
26
+ index += 1
27
+
28
+ if index >= limit:
29
+ break
30
+
31
+
32
+ def _max_index_id(path):
33
+ db = sqlite3.connect(path)
34
+
35
+ table = "sections"
36
+ df = pd.read_sql_query(f"select * from {table}", db)
37
+ return {"max_index": df["indexid"].max()}
38
+
39
+
40
+ def _prompt(question):
41
+ return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
42
+ Question: {question}
43
+ Context: """
44
+
45
+
46
def _search(query, extractor, question=None):
    """Run the extractor pipeline for *query* and return the answer text.

    Args:
        query: similarity query used to retrieve context.
        question: question posed to the LLM; defaults to *query*.
        extractor: txtai Extractor (or compatible callable).
    """
    # Fall back to the query itself when no explicit question is given.
    question = question or query

    # Single-task batch: (name, query, prompt, snippet-flag).
    tasks = [("answer", query, _prompt(question), False)]
    results = extractor(tasks)

    # Each result is a (name, answer) pair; return the answer of the first.
    return results[0][1]
52
+
53
+
54
def _text_splitter(doc):
    """Split loaded documents into overlapping chunks.

    Uses a character-count splitter: 500-character chunks with a
    50-character overlap between consecutive chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    return splitter.transform_documents(doc)
61
+
62
+
63
def _load_docs(path: str):
    """Fetch the web page at *path* and return it split into chunks."""
    raw_documents = WebBaseLoader(path).load()
    return _text_splitter(raw_documents)
67
+
68
+
69
async def _upsert_docs(doc, limit: int = 500):
    """Append *doc* chunks to the on-disk txtai index and save it.

    Args:
        doc: iterable of chunked documents (objects with ``page_content``).
        limit: absolute id ceiling passed through to ``_stream`` (kept at
            the original hard-coded 500 by default; see _stream's docs —
            it caps ids, not the number of new rows).

    Returns:
        The module-level ``embeddings`` instance after the upsert.
    """
    max_index = _max_index_id("index/documents")

    # Start new ids one past the current maximum. The original passed the
    # max itself, which made the first new chunk reuse an existing id and
    # overwrite the last stored section on upsert.
    start = max_index["max_index"] + 1
    embeddings.upsert(_stream(doc, limit, start))
    embeddings.save("index")

    return embeddings
75
+
76
+
77
@app.put("/rag/{path}")
async def get_doc_path(path: str):
    # NOTE(review): this endpoint only echoes *path* back to the caller;
    # nothing is persisted, so a later GET /rag cannot retrieve a
    # previously PUT path — confirm the intended ingestion flow.
    return path
80
+
81
+
82
@app.get("/rag")
async def rag(question: str, path: str = None):
    """Answer *question* with retrieval-augmented generation over the index.

    Args:
        question: natural-language question to answer.
        path: optional URL of a document to fetch, chunk and upsert into
            the index before answering.

    Returns:
        JSON object ``{"answer": ...}`` with the extracted answer, or the
        literal text 'no answer' when the context does not contain one.
    """
    embeddings.load("index")

    # The original referenced an undefined local `path` and rebound
    # `embeddings`, making the load() above raise UnboundLocalError.
    # Ingestion is now optional and driven by an explicit query parameter.
    if path:
        doc = _load_docs(path)
        # _upsert_docs is a coroutine: it must be awaited to actually run.
        await _upsert_docs(doc)

    # Create extractor instance
    extractor = Extractor(embeddings, "google/flan-t5-base")
    answer = _search(question, extractor)
    # Return a JSON object — the original `{answer}` built a one-element set.
    return {"answer": answer}
requirements.txt CHANGED
@@ -2,4 +2,7 @@ fastapi==0.74.*
2
  requests==2.27.*
3
  uvicorn[standard]==0.17.*
4
  sentencepiece==0.1.*
5
- txtai==6.0.*
 
 
 
 
2
  requests==2.27.*
3
  uvicorn[standard]==0.17.*
4
  sentencepiece==0.1.*
5
+ torch==1.11.*
6
+ transformers==4.*
7
+ txtai==6.0.*
8
+ langchain==0.0.295