umair894 committed on
Commit
22ee86e
1 Parent(s): eb6aca0

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +32 -5
main.py CHANGED
@@ -7,24 +7,45 @@ from typing import List
7
  import pytesseract
8
  import requests
9
  from io import BytesIO
10
- from top2vec import Top2Vec
11
- from llama_index.node_parser import SimpleNodeParser
12
- llama-index
13
  from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
14
 
 
 
15
 
16
 
17
 
18
 
19
  description = """
20
  ## DocQA
21
-
22
  This app shows how to do Document Question Answering
23
  Check out the docs for the `/predict` endpoint below to try it out!
24
  """
25
 
26
  app = FastAPI(docs_url="/", description=description)
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
29
 
30
 
@@ -64,9 +85,14 @@ def load_file(file_url: str, sentences: List[str]):
64
  model_name = "deepset/roberta-base-squad2"
65
 
66
  nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
 
 
 
 
 
67
 
68
  # Define the common context
69
- context = all_text
70
 
71
  # List of questions
72
  questions = sentences
@@ -74,6 +100,7 @@ def load_file(file_url: str, sentences: List[str]):
74
  qa_dict = {}
75
  # Get answers for each question with the same context
76
  for question in questions:
 
77
  QA_input = {
78
  'question': question,
79
  'context': context
 
7
  import pytesseract
8
  import requests
9
  from io import BytesIO
10
+
 
 
11
  from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
12
 
13
+ from top2vec import Top2Vec
14
+ from llama_index.node_parser import SimpleNodeParser
15
 
16
 
17
 
18
 
19
  description = """
20
  ## DocQA
 
21
  This app shows how to do Document Question Answering
22
  Check out the docs for the `/predict` endpoint below to try it out!
23
  """
24
 
25
  app = FastAPI(docs_url="/", description=description)
26
 
27
+ def doc_chunk(data):
28
+ node_parser = SimpleNodeParser.from_defaults(chunk_size=256)
29
+ nodes = node_parser.get_nodes_from_documents(data)
30
+ return nodes
31
+
32
+ def create_train_data(nodes):
33
+ data = []
34
+ for i in range(len(nodes)):
35
+ #print(nodes[i].get_content())
36
+ data.append(nodes[i].get_content())
37
+ return data
38
+
39
+ def get_model(data):
40
+ model = Top2Vec(data, embedding_model='universal-sentence-encoder')
41
+ return model
42
+
43
+ def get_search_result(model, question):
44
+ documents, doc_scores, doc_ids = model.query_documents(question, 1)
45
+
46
+ return documents
47
+
48
+
49
  # pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
50
 
51
 
 
85
  model_name = "deepset/roberta-base-squad2"
86
 
87
  nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
88
+ ##########################
89
+ nodes = doc_chunk(all_text)
90
+ data = create_train_data(nodes)
91
+ model = get_model(data)
92
+ #context = get_search_result(model, question)
93
 
94
  # Define the common context
95
+ #context = all_text
96
 
97
  # List of questions
98
  questions = sentences
 
100
  qa_dict = {}
101
  # Get answers for each question with the same context
102
  for question in questions:
103
+ context = get_search_result(model, question)
104
  QA_input = {
105
  'question': question,
106
  'context': context