lfoppiano committed
Commit 2397955 • 1 Parent(s): ddff75f

Enable extraction of coordinates from the PDF, using sentences as passages

document_qa/document_qa_engine.py CHANGED
@@ -56,7 +56,7 @@ class DocumentQAEngine:
         grobid_client = GrobidClient(
             grobid_server=self.grobid_url,
             batch_size=1000,
-            coordinates=["p"],
+            coordinates=["s"],
             sleep_time=5,
             timeout=60,
             check_server=True
@@ -104,7 +104,7 @@ class DocumentQAEngine:
         if verbose:
             print(query)
 
-        response = self._run_query(doc_id, query, context_size=context_size)
+        response, coordinates = self._run_query(doc_id, query, context_size=context_size)
         response = response['output_text'] if 'output_text' in response else response
 
         if verbose:
@@ -115,17 +115,17 @@
                 return self._parse_json(response, output_parser), response
             except Exception as oe:
                 print("Failing to parse the response", oe)
-                return None, response
+                return None, response, coordinates
         elif extraction_schema:
             try:
                 chain = create_extraction_chain(extraction_schema, self.llm)
                 parsed = chain.run(response)
-                return parsed, response
+                return parsed, response, coordinates
             except Exception as oe:
                 print("Failing to parse the response", oe)
-                return None, response
+                return None, response, coordinates
         else:
-            return None, response
+            return None, response, coordinates
 
     def query_storage(self, query: str, doc_id, context_size=4):
         documents = self._get_context(doc_id, query, context_size)
@@ -156,12 +156,13 @@
 
     def _run_query(self, doc_id, query, context_size=4):
        relevant_documents = self._get_context(doc_id, query, context_size)
+        relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else [] for doc in relevant_documents]  # filter(lambda d: d['type'] == "sentence", relevant_documents)]
         response = self.chain.run(input_documents=relevant_documents,
                                   question=query)
 
         if self.memory:
             self.memory.save_context({"input": query}, {"output": response})
-        return response
+        return response, relevant_document_coordinates
 
     def _get_context(self, doc_id, query, context_size=4):
         db = self.embeddings_dict[doc_id]
@@ -194,7 +195,8 @@
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
-        structure = self.grobid_processor.process_structure(pdf_file_path)
+        coordinates = True if chunk_size == -1 else False
+        structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
 
         biblio = structure['biblio']
         biblio['filename'] = filename.replace(" ", "_")
@@ -215,6 +217,7 @@
             biblio_copy['type'] = passage['type']
             biblio_copy['section'] = passage['section']
             biblio_copy['subSection'] = passage['subSection']
+            biblio_copy['coordinates'] = passage['coordinates']
             metadatas.append(biblio_copy)
 
             ids.append(passage['passage_id'])
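Each stored passage now carries a `coordinates` metadata entry holding GROBID's TEI `coords` string: one or more bounding boxes separated by ";", each box encoded as "page,x,y,width,height". `_run_query` splits that string per retrieved document and returns the resulting lists alongside the LLM response. A minimal sketch of decoding one such string into numeric boxes (the helper name and sample values are illustrative, not part of the commit):

# Minimal sketch (not part of the commit): decode one GROBID "coords" string.
# Format, as consumed by _run_query and streamlit_app.py: boxes separated by ";",
# each box "page,x,y,width,height".
from typing import Dict, List


def parse_coords(coords: str) -> List[Dict[str, float]]:
    boxes = []
    for box in filter(None, coords.split(";")):
        page, x, y, width, height = box.split(",")
        boxes.append({"page": int(page), "x": float(x), "y": float(y),
                      "width": float(width), "height": float(height)})
    return boxes


# Example: a sentence spanning two boxes on page 2
print(parse_coords("2,61.0,293.9,227.4,9.4;2,61.0,305.4,118.3,9.4"))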
document_qa/grobid_processors.py CHANGED
@@ -131,13 +131,13 @@ class GrobidProcessor(BaseProcessor):
         # super().__init__()
         self.grobid_client = grobid_client
 
-    def process_structure(self, input_path):
+    def process_structure(self, input_path, coordinates=False):
         pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
                                                                 input_path,
                                                                 consolidate_header=True,
                                                                 consolidate_citations=False,
-                                                                segment_sentences=False,
-                                                                tei_coordinates=False,
+                                                                segment_sentences=True,
+                                                                tei_coordinates=coordinates,
                                                                 include_raw_citations=False,
                                                                 include_raw_affiliations=False,
                                                                 generateIDs=True)
@@ -145,7 +145,7 @@ class GrobidProcessor(BaseProcessor):
         if status != 200:
             return
 
-        output_data = self.parse_grobid_xml(text)
+        output_data = self.parse_grobid_xml(text, coordinates=coordinates)
         output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
 
         return output_data
@@ -159,7 +159,7 @@ class GrobidProcessor(BaseProcessor):
 
         return doc
 
-    def parse_grobid_xml(self, text):
+    def parse_grobid_xml(self, text, coordinates=False):
         output_data = OrderedDict()
 
         doc_biblio = grobid_tei_xml.parse_document_xml(text)
@@ -188,17 +188,20 @@ class GrobidProcessor(BaseProcessor):
         #     "passage_id": "title0"
         # })
 
+        passage_type = "sentence" if coordinates else "paragraph"
+
         if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
             passages.append({
                 "text": self.post_process(doc_biblio.abstract),
-                "type": "paragraph",
+                "type": passage_type,
                 "section": "<header>",
                 "subSection": "<abstract>",
-                "passage_id": "abstract0"
+                "passage_id": "abstract0",
+                "coordinates": ""
             })
 
         soup = BeautifulSoup(text, 'xml')
-        text_blocks_body = get_children_body(soup, verbose=False)
+        text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=False)
 
         passages.extend([
             {
@@ -206,10 +209,12 @@ class GrobidProcessor(BaseProcessor):
                     text.parent.name != "ref" or (
                             text.parent.name == "ref" and text.parent.attrs[
                         'type'] != 'bibr'))),
-                "type": "paragraph",
+                "type": passage_type,
                 "section": "<body>",
-                "subSection": "<paragraph>",
-                "passage_id": str(paragraph_id) + str(sentence_id)
+                "subSection": "<sentence>",
+                "passage_id": str(paragraph_id) + str(sentence_id),
+                # "coordinates": sentence['coords'].split(";") if coordinates else []
+                "coordinates": sentence['coords'] if coordinates else ""
             }
             for paragraph_id, paragraph in enumerate(text_blocks_body) for
             sentence_id, sentence in enumerate(paragraph)
@@ -223,10 +228,11 @@ class GrobidProcessor(BaseProcessor):
                     text.parent.name != "ref" or (
                             text.parent.name == "ref" and text.parent.attrs[
                         'type'] != 'bibr'))),
-                "type": "paragraph",
+                "type": passage_type,
                 "section": "<body>",
                 "subSection": "<figure>",
-                "passage_id": str(paragraph_id) + str(sentence_id)
+                "passage_id": str(paragraph_id) + str(sentence_id),
+                "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
             }
             for paragraph_id, paragraph in enumerate(text_blocks_figures) for
             sentence_id, sentence in enumerate(paragraph)
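With `coordinates=True`, `process_structure` now asks GROBID for sentence segmentation and TEI coordinates, and `parse_grobid_xml` emits one passage per sentence with its raw `coords` string. A rough usage sketch, assuming a GROBID server at http://localhost:8070, a placeholder PDF path, and that the parsed passages are exposed under `structure['passages']` (client settings mirror those in DocumentQAEngine; none of this snippet is part of the commit):

# Sketch only: drive the updated GrobidProcessor directly.
# Assumptions: local GROBID server URL, "paper.pdf" placeholder,
# and the structure['passages'] key name.
from grobid_client.grobid_client import GrobidClient
from document_qa.grobid_processors import GrobidProcessor

client = GrobidClient(grobid_server="http://localhost:8070",
                      batch_size=1000,
                      coordinates=["s"],  # sentence-level coordinates, as in DocumentQAEngine
                      sleep_time=5,
                      timeout=60,
                      check_server=True)
processor = GrobidProcessor(client)

structure = processor.process_structure("paper.pdf", coordinates=True)
for passage in structure['passages'][:5]:
    # Each passage is now a sentence; 'coordinates' holds the raw TEI coords string
    # ("page,x,y,width,height" boxes separated by ";"), or "" when unavailable.
    print(passage['type'], passage['section'], passage['subSection'], passage['coordinates'])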
streamlit_app.py CHANGED
@@ -59,6 +59,12 @@ if 'memory' not in st.session_state:
 if 'binary' not in st.session_state:
     st.session_state['binary'] = None
 
+if 'annotations' not in st.session_state:
+    st.session_state['annotations'] = None
+
+if 'pdf' not in st.session_state:
+    st.session_state['pdf'] = None
+
 st.set_page_config(
     page_title="Scientific Document Insights Q/A",
     page_icon="📝",
@@ -290,7 +296,7 @@ with st.sidebar:
     mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
                     help="LLM will respond the question, Embedding will show the "
                          "paragraphs relevant to the question in the paper.")
-    chunk_size = st.slider("Chunks size", 100, 2000, value=250,
+    chunk_size = st.slider("Chunks size", -1, 2000, value=250,
                            help="Size of chunks in which the document is partitioned",
                            disabled=uploaded_file is not None)
     context_size = st.slider("Context size", 3, 10, value=4,
@@ -320,8 +326,6 @@ with st.sidebar:
     st.markdown(
         """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
 
-
-
 if uploaded_file and not st.session_state.loaded_embeddings:
     if model not in st.session_state['api_keys']:
         st.error("Before uploading a document, you must enter the API key. ")
@@ -344,8 +348,8 @@ if uploaded_file and not st.session_state.loaded_embeddings:
     # timestamp = datetime.utcnow()
 
     with left_column:
-        if st.session_state['binary']:
-            pdf_viewer(st.session_state['binary'])
+        if st.session_state['annotations']:
+            pdf_viewer(input=st.session_state['binary'], annotations=st.session_state['annotations'])
 
     with right_column:
         # css = '''
@@ -389,8 +393,14 @@ with right_column:
                                                                   context_size=context_size)
     elif mode == "LLM":
         with st.spinner("Generating response..."):
-            _, text_response = st.session_state['rqa'][model].query_document(question, st.session_state.doc_id,
-                                                                             context_size=context_size)
+            _, text_response, coordinates = st.session_state['rqa'][model].query_document(question,
                                                                                           st.session_state.doc_id,
                                                                                           context_size=context_size)
+            st.session_state['annotations'] = [
+                {"page": coo[0], "x": coo[1], "y": coo[2], "width": coo[3], "height": coo[4], "color": "blue"}
+                for coo in [c.split(",") for coord in coordinates for c in coord]]
+            # with left_column:
+            #     pdf_viewer(input=st.session_state['binary'], annotations=st.session_state['annotations'], key=1)
 
         if not text_response:
             st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")