fix import, and reformat
- document_qa/document_qa_engine.py +1 -1
- document_qa/grobid_processors.py +5 -2
- streamlit_app.py +18 -10
document_qa/document_qa_engine.py  CHANGED

@@ -12,7 +12,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from tqdm import tqdm
 
-from grobid_processors import GrobidProcessor
+from document_qa.grobid_processors import GrobidProcessor
 
 
 class DocumentQAEngine:
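This one-line change is the "fix import" half of the commit: once `grobid_processors.py` lives inside the `document_qa` package, the bare `from grobid_processors import ...` only resolves if `document_qa/` itself happens to be on `sys.path`. A minimal sketch of the failure mode, assuming the app is launched from the repository root (the layout is inferred from the file paths in this diff, not shown in it):

# Inferred layout (assumption):
#   repo/
#   ├── streamlit_app.py
#   └── document_qa/
#       ├── document_qa_engine.py
#       └── grobid_processors.py
#
# `streamlit run streamlit_app.py` from repo/ puts repo/ on sys.path,
# not repo/document_qa/, so inside document_qa_engine.py:

# from grobid_processors import GrobidProcessor             # ModuleNotFoundError
from document_qa.grobid_processors import GrobidProcessor   # resolves via the package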
document_qa/grobid_processors.py  CHANGED

@@ -413,7 +413,8 @@ class GrobidMaterialsProcessor(BaseProcessor):
 
     def extract_materials(self, text):
         preprocessed_text = text.strip()
-        status, result = self.grobid_superconductors_client.process_text(preprocessed_text, "processText_disable_linking")
+        status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
+                                                                         "processText_disable_linking")
 
         if status != 200:
             result = {}

@@ -679,6 +680,7 @@ class XmlProcessor(BaseProcessor):
 
         return output_data
 
+
 def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
     children = []
 

@@ -697,6 +699,7 @@ def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
 
     return children
 
+
 def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
     children = []
 

@@ -739,4 +742,4 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: o
     if verbose:
         print(str(children))
 
-    return children
+    return children
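For context on the first hunk: `process_text` returns an HTTP-style `(status, result)` pair, which is why the guard that follows resets `result` to an empty dict. A small usage sketch of the same pattern, with `client` as a hypothetical stand-in for the configured `grobid_superconductors_client`:

def extract_materials_safe(client, text):
    # Same shape as the hunk above: strip the input, call the service,
    # and fall back to an empty result on any non-200 status.
    preprocessed_text = text.strip()
    status, result = client.process_text(preprocessed_text,
                                         "processText_disable_linking")
    if status != 200:
        result = {}
    return result

The remaining hunks are whitespace-only: a second blank line before each module-level `def` (the PEP 8 two-blank-line convention) and a whitespace-only change to the final `return children`.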
streamlit_app.py  CHANGED

@@ -42,6 +42,7 @@ if 'git_rev' not in st.session_state:
 if "messages" not in st.session_state:
     st.session_state.messages = []
 
+
 def new_file():
     st.session_state['loaded_embeddings'] = None
     st.session_state['doc_id'] = None

@@ -69,6 +70,7 @@ def init_qa(model):
 
     return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
 
+
 @st.cache_resource
 def init_ner():
     quantities_client = QuantitiesAPI(os.environ['GROBID_QUANTITIES_URL'], check_server=True)

@@ -89,14 +91,16 @@ def init_ner():
     materials_client.set_config(config_materials)
 
     gqa = GrobidAggregationProcessor(None,
-
-
-
+                                     grobid_quantities_client=quantities_client,
+                                     grobid_superconductors_client=materials_client
+                                     )
 
     return gqa
 
+
 gqa = init_ner()
 
+
 def get_file_hash(fname):
     hash_md5 = blake2b()
     with open(fname, "rb") as f:

@@ -122,7 +126,7 @@ def play_old_messages():
 is_api_key_provided = st.session_state['api_key']
 
 model = st.sidebar.radio("Model (cannot be changed after selection or upload)",
-                         ("chatgpt-3.5-turbo", "mistral-7b-instruct-v0.1")
+                         ("chatgpt-3.5-turbo", "mistral-7b-instruct-v0.1"),  # , "llama-2-70b-chat"),
                          index=1,
                          captions=[
                              "ChatGPT 3.5 Turbo + Ada-002-text (embeddings)",

@@ -134,13 +138,15 @@ model = st.sidebar.radio("Model (cannot be changed after selection or upload)",
 
 if not st.session_state['api_key']:
     if model == 'mistral-7b-instruct-v0.1' or model == 'llama-2-70b-chat':
-        api_key = st.sidebar.text_input('Huggingface API Key', type="password")  # if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ else os.environ['HUGGINGFACEHUB_API_TOKEN']
+        api_key = st.sidebar.text_input('Huggingface API Key',
+                                        type="password")  # if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ else os.environ['HUGGINGFACEHUB_API_TOKEN']
         if api_key:
             st.session_state['api_key'] = is_api_key_provided = True
             os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
             st.session_state['rqa'] = init_qa(model)
     elif model == 'chatgpt-3.5-turbo':
-        api_key = st.sidebar.text_input('OpenAI API Key', type="password")  # if 'OPENAI_API_KEY' not in os.environ else os.environ['OPENAI_API_KEY']
+        api_key = st.sidebar.text_input('OpenAI API Key',
+                                        type="password")  # if 'OPENAI_API_KEY' not in os.environ else os.environ['OPENAI_API_KEY']
         if api_key:
             st.session_state['api_key'] = is_api_key_provided = True
             os.environ['OPENAI_API_KEY'] = api_key

@@ -177,10 +183,12 @@ with st.sidebar:
     st.markdown(
         """After entering your API Key (Open AI or Huggingface). Upload a scientific article as PDF document. You will see a spinner or loading indicator while the processing is in progress. Once the spinner stops, you can proceed to ask your questions.""")
 
-    st.markdown(
+    st.markdown(
+        '**NER on LLM responses**: The responses from the LLMs are post-processed to extract <span style="color:orange">physical quantities, measurements</span> and <span style="color:green">materials</span> mentions.',
+        unsafe_allow_html=True)
     if st.session_state['git_rev'] != "unknown":
         st.markdown("**Revision number**: [" + st.session_state[
-
+            'git_rev'] + "](https://github.com/lfoppiano/document-qa/commit/" + st.session_state['git_rev'] + ")")
 
     st.header("Query mode (Advanced use)")
     st.markdown(

@@ -219,11 +227,11 @@ if st.session_state.loaded_embeddings and question and len(question) > 0 and st.
     if mode == "Embeddings":
         with st.spinner("Generating LLM response..."):
             text_response = st.session_state['rqa'].query_storage(question, st.session_state.doc_id,
-
+                                                                  context_size=context_size)
     elif mode == "LLM":
         with st.spinner("Generating response..."):
             _, text_response = st.session_state['rqa'].query_document(question, st.session_state.doc_id,
-
+                                                                      context_size=context_size)
 
     if not text_response:
         st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
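A note on the `@st.cache_resource` decorator visible in the `init_ner` context above: Streamlit re-executes the whole script on every widget interaction, and the decorator is what keeps the NER clients from being rebuilt on each rerun. A minimal self-contained sketch of the same pattern (`ExpensiveClient` and its URL are hypothetical stand-ins, not from this repository):

import streamlit as st


class ExpensiveClient:
    # Hypothetical stand-in for QuantitiesAPI or the superconductors client.
    def __init__(self, url):
        self.url = url  # imagine a slow check_server=True handshake here


@st.cache_resource
def init_client():
    # Executed once per process; later script reruns reuse the same instance.
    return ExpensiveClient("http://localhost:8060")


client = init_client()
st.write("Client ready at " + client.url)

The `type="password"` arguments added to the sidebar inputs only mask the keys in the UI; as the unchanged lines show, the values are still exported to the environment immediately afterwards.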