include title, authors and year in the data store

- document_qa/document_qa_engine.py +30 -8
- streamlit_app.py +2 -1
document_qa/document_qa_engine.py
CHANGED
@@ -1,4 +1,5 @@
 import copy
+import json
 import os
 from pathlib import Path
 from typing import Union, Any
@@ -173,8 +174,10 @@ class DocumentQAEngine:
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents
 
-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
-        """
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
+        """
+        Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
+        """
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
@@ -189,6 +192,7 @@ class DocumentQAEngine:
         texts = []
         metadatas = []
         ids = []
+
         if chunk_size < 0:
             for passage in structure['passages']:
                 biblio_copy = copy.copy(biblio)
@@ -212,10 +216,25 @@ class DocumentQAEngine:
             metadatas = [biblio for _ in range(len(texts))]
             ids = [id for id, t in enumerate(texts)]
 
+        if "biblio" in include:
+            biblio_metadata = copy.copy(biblio)
+            biblio_metadata['type'] = "biblio"
+            biblio_metadata['section'] = "header"
+            for key in ['title', 'authors', 'year']:
+                if key in biblio_metadata:
+                    texts.append("{}: {}".format(key, biblio_metadata[key]))
+                    metadatas.append(biblio_metadata)
+                    ids.append(key)
+
         return texts, metadatas, ids
 
-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
-        texts, metadata, ids = self.get_text_from_document(pdf_path, chunk_size=chunk_size, perc_overlap=perc_overlap)
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
+        include = ["biblio"] if include_biblio else []
+        texts, metadata, ids = self.get_text_from_document(
+            pdf_path,
+            chunk_size=chunk_size,
+            perc_overlap=perc_overlap,
+            include=include)
         if doc_id:
             hash = doc_id
         else:
@@ -233,7 +252,7 @@ class DocumentQAEngine:
 
         return hash
 
-    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1):
+    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1, include_biblio=False):
         input_files = []
         for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
             for file_ in files:
@@ -250,9 +269,12 @@ class DocumentQAEngine:
             if os.path.exists(data_path):
                 print(data_path, "exists. Skipping it ")
                 continue
-
-            texts, metadata, ids = self.get_text_from_document(
-                input_file, chunk_size=chunk_size, perc_overlap=perc_overlap)
+            include = ["biblio"] if include_biblio else []
+            texts, metadata, ids = self.get_text_from_document(
+                input_file,
+                chunk_size=chunk_size,
+                perc_overlap=perc_overlap,
+                include=include)
             filename = metadata[0]['filename']
 
             vector_db_document = Chroma.from_texts(texts,
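To make the effect of the new include parameter concrete, here is a minimal, self-contained sketch of the biblio branch added to get_text_from_document. The biblio dict below is a hypothetical stand-in for the record the Grobid extraction produces; the field names mirror the diff, the sample values are invented:

import copy

# Hypothetical bibliographic record; in the real code this comes from Grobid.
biblio = {'filename': 'paper', 'title': 'A Study of X', 'authors': 'Doe, J.', 'year': '2023'}

include = ["biblio"]  # what include_biblio=True expands to
texts, metadatas, ids = [], [], []

# Same logic as the new branch: one "key: value" text entry per bibliographic
# field, all sharing a single metadata record tagged type="biblio".
if "biblio" in include:
    biblio_metadata = copy.copy(biblio)
    biblio_metadata['type'] = "biblio"
    biblio_metadata['section'] = "header"
    for key in ['title', 'authors', 'year']:
        if key in biblio_metadata:
            texts.append("{}: {}".format(key, biblio_metadata[key]))
            metadatas.append(biblio_metadata)
            ids.append(key)

print(texts)  # ['title: A Study of X', 'authors: Doe, J.', 'year: 2023']
print(ids)    # ['title', 'authors', 'year']

All three entries share one metadata dict marked type="biblio" and section="header", presumably so retrieval code can tell these header entries apart from regular passage chunks.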
streamlit_app.py
CHANGED
@@ -283,7 +283,8 @@ if uploaded_file and not st.session_state.loaded_embeddings:
         # hash = get_file_hash(tmp_file.name)[:10]
         st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
                                                                                                      chunk_size=chunk_size,
-                                                                                                     perc_overlap=0.1)
+                                                                                                     perc_overlap=0.1,
+                                                                                                     include_biblio=True)
         st.session_state['loaded_embeddings'] = True
         st.session_state.messages = []
 
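On the application side the change is a single extra keyword argument. Stripped of the Grobid and Chroma machinery, here is a runnable sketch of how include_biblio=True threads down to include=["biblio"]; the function bodies are stand-ins, not the real implementations:

def get_text_from_document(pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
    # Stand-in for the real Grobid-based extraction.
    print("extracting", pdf_file_path, "with include =", list(include))
    return [], [], []

def create_memory_embeddings(pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
    # Same flag translation as the diff: a boolean at the API surface,
    # a list of section names internally.
    include = ["biblio"] if include_biblio else []
    return get_text_from_document(pdf_path, chunk_size=chunk_size, perc_overlap=perc_overlap, include=include)

create_memory_embeddings("paper.pdf", include_biblio=True)   # include = ['biblio']
create_memory_embeddings("paper.pdf")                        # include = []

Defaulting include_biblio to False keeps existing callers, including create_embeddings batch runs, behaving exactly as before; only the Streamlit upload path opts in.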