fastapi-document-qa_semantic

Runtime error

App Files Files Community

umair894 committed on Sep 18, 2023

Commit

22ee86e

1 Parent(s): eb6aca0

Update main.py

Browse files

Files changed (1) hide show

main.py +32 -5

main.py CHANGED Viewed

@@ -7,24 +7,45 @@ from typing import List
 import pytesseract
 import requests
 from io import BytesIO
-from top2vec import Top2Vec
-from llama_index.node_parser import SimpleNodeParser
-llama-index
 from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
 description = """
 ## DocQA
 This app shows how to do Document Question Answering
 Check out the docs for the `/predict` endpoint below to try it out!
 """
 app = FastAPI(docs_url="/", description=description)
 # pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
@@ -64,9 +85,14 @@ def load_file(file_url: str, sentences: List[str]):
         model_name = "deepset/roberta-base-squad2"
         nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
         # Define the common context
-        context = all_text
         # List of questions
         questions = sentences
@@ -74,6 +100,7 @@ def load_file(file_url: str, sentences: List[str]):
         qa_dict = {}
         # Get answers for each question with the same context
         for question in questions:
             QA_input = {
                 'question': question,
                 'context': context

 import pytesseract
 import requests
 from io import BytesIO
 from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
+from top2vec import Top2Vec
+from llama_index.node_parser import SimpleNodeParser
 description = """
 ## DocQA
 This app shows how to do Document Question Answering
 Check out the docs for the `/predict` endpoint below to try it out!
 """
 app = FastAPI(docs_url="/", description=description)
+def doc_chunk(data):
+  node_parser = SimpleNodeParser.from_defaults(chunk_size=256)
+  nodes = node_parser.get_nodes_from_documents(data)
+  return nodes
+def create_train_data(nodes):
+  data = []
+  for i in range(len(nodes)):
+    #print(nodes[i].get_content())
+    data.append(nodes[i].get_content())
+  return data
+def get_model(data):
+  model = Top2Vec(data, embedding_model='universal-sentence-encoder')
+  return model
+def get_search_result(model, question):
+  documents, doc_scores, doc_ids  = model.query_documents(question, 1)
+  return documents
 # pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
         model_name = "deepset/roberta-base-squad2"
         nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
+        ##########################
+        nodes = doc_chunk(all_text)
+        data = create_train_data(nodes)
+        model = get_model(data)
+        #context = get_search_result(model, question)
         # Define the common context
+        #context = all_text
         # List of questions
         questions = sentences
         qa_dict = {}
         # Get answers for each question with the same context
         for question in questions:
+            context = get_search_result(model, question)
             QA_input = {
                 'question': question,
                 'context': context