ankurmondal committed on
Commit 04f9592
Parent: 96b85d7

Upload 4 files

Files changed (4)
  1. Dockerfile +1 -1
  2. index.py +168 -0
  3. main.py +42 -22
  4. requirements.txt +6 -1
Dockerfile CHANGED
@@ -24,4 +24,4 @@ WORKDIR $HOME/app
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
 
-CMD ["uvicorn", "extractor:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "index:app", "--host", "0.0.0.0", "--port", "7860"]
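For quick local testing outside the container, a minimal sketch of an equivalent entry point (a hypothetical helper, not part of this commit; it assumes uvicorn is installed and that index.py sits in the working directory):

    # run_local.py - hypothetical local runner
    import uvicorn

    if __name__ == "__main__":
        # Serve the FastAPI app from index.py on the same port the Dockerfile CMD uses
        uvicorn.run("index:app", host="0.0.0.0", port=7860)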
index.py ADDED
@@ -0,0 +1,168 @@
+from fastapi import FastAPI
+
+# from transformers import pipeline
+from txtai.embeddings import Embeddings
+from txtai.pipeline import Extractor
+from langchain.document_loaders import WebBaseLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+import pandas as pd
+import sqlite3
+import os
+
+# NOTE - we configure docs_url to serve the interactive Docs at the root path
+# of the app. This way, we can use the docs as a landing page for the app on Spaces.
+app = FastAPI(docs_url="/")
+
+# pipe = pipeline("text2text-generation", model="google/flan-t5-small")
+
+
+# @app.get("/generate")
+# def generate(text: str):
+#     """
+#     Using the text2text-generation pipeline from `transformers`, generate text
+#     from the given input text. The model used is `google/flan-t5-small`, which
+#     can be found [here](https://huggingface.co/google/flan-t5-small).
+#     """
+#     output = pipe(text)
+#     return {"output": output[0]["generated_text"]}
+
+
+def load_embeddings(
+    domain: str = "",
+    db_present: bool = True,
+    path: str = "sentence-transformers/all-MiniLM-L6-v2",
+    index_name: str = "index",
+):
+    # Create embeddings model with content support
+    embeddings = Embeddings({"path": path, "content": True})
+
+    # If no vector DB exists yet, return a fresh (empty) embeddings instance
+    if not db_present:
+        return embeddings
+    if domain == "":
+        embeddings.load(index_name)  # change this later
+    else:
+        embeddings.load(f"{index_name}/{domain}")
+    return embeddings
+
+
+def _check_if_db_exists(db_path: str) -> bool:
+    return os.path.exists(db_path)
+
+
+def _text_splitter(doc):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+        length_function=len,
+    )
+    return text_splitter.transform_documents(doc)
+
+
+def _load_docs(path: str):
+    load_doc = WebBaseLoader(path).load()
+    doc = _text_splitter(load_doc)
+    return doc
+
+
+def _stream(dataset, limit, index: int = 0):
+    # Yield (id, text, tags) tuples for txtai until the running id reaches `limit`
+    for row in dataset:
+        yield (index, row.page_content, None)
+        index += 1
+
+        if index >= limit:
+            break
+
+
+def _max_index_id(path):
+    # Read the highest indexid from the "sections" table txtai stores in SQLite
+    db = sqlite3.connect(path)
+    table = "sections"
+    df = pd.read_sql_query(f"select * from {table}", db)
+    return {"max_index": df["indexid"].max()}
+
+
+def _upsert_docs(doc, embeddings, vector_doc_path: str, db_present: bool):
+    if db_present:
+        # Resume ids after the current maximum so the upsert appends new rows
+        # instead of overwriting the last stored one
+        max_index = _max_index_id(os.path.join(vector_doc_path, "documents"))
+        embeddings.upsert(_stream(doc, 500, max_index["max_index"] + 1))
+        embeddings.save(vector_doc_path)
+    else:
+        embeddings.index(_stream(doc, 500, 0))
+        embeddings.save(vector_doc_path)
+        max_index = _max_index_id(os.path.join(vector_doc_path, "documents"))
+    return max_index
+
+
+# def prompt(question):
+#     return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
+#     Question: {question}
+#     Context: """
+
+
+# def search(query, question=None):
+#     # Default question to query if empty
+#     if not question:
+#         question = query
+
+#     return extractor([("answer", query, prompt(question), False)])[0][1]
+
+
+# @app.get("/rag")
+# def rag(question: str):
+#     answer = search(question)
+#     return {answer}
+
+
+# @app.get("/index")
+# def get_url_file_path(url_path: str):
+#     embeddings = load_embeddings()
+#     doc = _load_docs(url_path)
+#     embeddings, max_index = _upsert_docs(doc, embeddings)
+#     return max_index
+
+
+@app.get("/index/{domain}/")
+def get_domain_file_path(domain: str, file_path: str):
+    # Build the path portably; the container runs on Linux, so avoid backslashes
+    db_exists = _check_if_db_exists(
+        db_path=os.path.join(os.getcwd(), "index", domain, "documents")
+    )
+    # Load the existing index for this domain, or start a fresh one; both
+    # cases then share the same upsert path
+    embeddings = load_embeddings(domain=domain, db_present=db_exists)
+    doc = _load_docs(file_path)
+    _upsert_docs(
+        doc=doc,
+        embeddings=embeddings,
+        vector_doc_path=f"index/{domain}",
+        db_present=db_exists,
+    )
+    return "Executed Successfully!!"
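A minimal client sketch for the new indexing endpoint (the base URL, domain name, and target page are illustrative assumptions, not part of this commit):

    import requests

    # Hypothetical call: index a web page under the "docs" domain.
    # Assumes the app is running locally on port 7860 (see the Dockerfile CMD).
    resp = requests.get(
        "http://localhost:7860/index/docs/",
        params={"file_path": "https://example.com/some-page"},
    )
    print(resp.status_code, resp.json())  # expected body: "Executed Successfully!!"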
main.py CHANGED
@@ -1,22 +1,14 @@
 from fastapi import FastAPI
-# from transformers import pipeline
 from txtai.embeddings import Embeddings
 from txtai.pipeline import Extractor
+import os
+
+# from transformers import pipeline
 
 # NOTE - we configure docs_url to serve the interactive Docs at the root path
 # of the app. This way, we can use the docs as a landing page for the app on Spaces.
 app = FastAPI(docs_url="/")
 
-# Create embeddings model with content support
-embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2", "content": True})
-embeddings.load('index')
-
-# Create extractor instance
-extractor = Extractor(embeddings, "google/flan-t5-base")
-
-# pipe = pipeline("text2text-generation", model="google/flan-t5-small")
-
-
 # @app.get("/generate")
 # def generate(text: str):
 #     """
@@ -28,23 +20,51 @@
 #     return {"output": output[0]["generated_text"]}
 
 
-def prompt(question):
-    return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
+def _check_if_db_exists(db_path: str) -> bool:
+    return os.path.exists(db_path)
+
+
+def _load_embeddings_from_db(
+    db_present: bool,
+    domain: str,
+    path: str = "sentence-transformers/all-MiniLM-L6-v2",
+):
+    # Create embeddings model with content support
+    embeddings = Embeddings({"path": path, "content": True})
+    # If no vector DB exists yet, return a fresh (empty) embeddings instance
+    if not db_present:
+        return embeddings
+    if domain == "":
+        embeddings.load("index")  # change this later
+    else:
+        embeddings.load(f"index/{domain}")
+    return embeddings
+
+
+def _prompt(question):
+    return f"""Answer the following question using only the context below. Say 'no answer' when the question can't be answered.
 Question: {question}
 Context: """
 
 
-def search(query, question=None):
+def _search(query, extractor, question=None):
     # Default question to query if empty
     if not question:
         question = query
 
-    return extractor([("answer", query, prompt(question), False)])[0][1]
+    return extractor([("answer", query, _prompt(question), False)])[0][1]
 
 
 @app.get("/rag")
-def rag(question: str):
-    # question = "what is the document about?"
-    answer = search(question)
-    return {answer}
+def rag(domain: str, question: str):
+    # Build the path portably; the container runs on Linux, so avoid backslashes
+    db_exists = _check_if_db_exists(
+        db_path=os.path.join(os.getcwd(), "index", domain, "documents")
+    )
+    embeddings = _load_embeddings_from_db(db_exists, domain)
+    # Create extractor instance over the loaded (or empty) embeddings
+    extractor = Extractor(embeddings, "google/flan-t5-base")
+    answer = _search(question, extractor)
+    return {"answer": answer}
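And a matching client sketch for the reworked /rag endpoint (the domain and question values are illustrative; the endpoint assumes the domain was indexed via index.py first):

    import requests

    # Hypothetical query against a previously indexed domain.
    resp = requests.get(
        "http://localhost:7860/rag",
        params={"domain": "docs", "question": "What is this document about?"},
    )
    print(resp.json())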
requirements.txt CHANGED
@@ -2,6 +2,11 @@ fastapi==0.74.*
 requests==2.27.*
 uvicorn[standard]==0.17.*
 sentencepiece==0.1.*
+torch==1.11.*
 transformers==4.*
 txtai==6.0.*
-langchain==0.0.295
+langchain==0.0.301
+langsmith==0.0.40
+bs4==0.0.1
+pandas==2.1.1
+SQLAlchemy==2.0.21