Merge branch 'main' into add-pdf-viewer
Files changed:
- .devcontainer/devcontainer.json  +33 -0
- .gitignore  +5 -1
- CHANGELOG.md  +26 -0
- README.md  +16 -8
- document_qa/document_qa_engine.py  +1 -0
- pyproject.toml  +1 -1
- requirements.txt  +6 -6
- streamlit_app.py  +24 -17
.devcontainer/devcontainer.json
ADDED
@@ -0,0 +1,33 @@
+{
+    "name": "Python 3",
+    // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+    "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
+    "customizations": {
+        "codespaces": {
+            "openFiles": [
+                "README.md",
+                "streamlit_app.py"
+            ]
+        },
+        "vscode": {
+            "settings": {},
+            "extensions": [
+                "ms-python.python",
+                "ms-python.vscode-pylance"
+            ]
+        }
+    },
+    "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
+    "postAttachCommand": {
+        "server": "streamlit run streamlit_app.py --server.enableCORS false --server.enableXsrfProtection false"
+    },
+    "portsAttributes": {
+        "8501": {
+            "label": "Application",
+            "onAutoForward": "openPreview"
+        }
+    },
+    "forwardPorts": [
+        8501
+    ]
+}
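The devcontainer launches the app through `postAttachCommand` and auto-forwards port 8501. A minimal sanity-check sketch (not part of the change; the health endpoint path can vary across Streamlit versions, `/_stcore/health` is correct for the pinned 1.29.0):

```python
# Check that the Streamlit server started by the postAttachCommand
# answers on the forwarded port 8501.
import urllib.request

with urllib.request.urlopen("http://localhost:8501/_stcore/health", timeout=5) as resp:
    print(resp.status, resp.read().decode())  # expected: 200 ok
```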
.gitignore
CHANGED
@@ -3,4 +3,8 @@
 .env.docker
 **/**/.chroma
 resources/db
-build
+build
+dist
+__pycache__
+document_qa/__pycache__
+document_qa_engine.egg-info/
CHANGELOG.md
CHANGED
@@ -4,6 +4,32 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [0.3.4] - 2023-12-16
+
+### Added
+
++ Add gpt4 and gpt4-turbo
+
+### Changed
+
++ Improved UI: replace combo boxes with a dropdown box
+
+## [0.3.3] - 2023-12-14
+
+### Added
+
++ Add experimental PDF rendering in the page
+
+### Fixed
+
++ Fix GrobidProcessors API implementation
+
+## [0.3.2] - 2023-12-01
+
+### Fixed
+
++ Remove memory when using Zephyr-7b-beta, which easily hallucinates
+
 ## [0.3.1] - 2023-11-22
 
 ### Added
README.md
CHANGED
@@ -16,9 +16,11 @@ license: apache-2.0
 
 <img src="https://github.com/lfoppiano/document-qa/assets/15426/f0a04a86-96b3-406e-8303-904b93f00015" width=300 align="right" />
 
+https://lfoppiano-document-qa.hf.space/
+
 ## Introduction
 
-Question/Answering on scientific documents using LLMs: ChatGPT-3.5-turbo, Mistral-7b-instruct and Zephyr-7b-beta.
+Question/Answering on scientific documents using LLMs: ChatGPT-3.5-turbo, GPT4, GPT4-Turbo, Mistral-7b-instruct and Zephyr-7b-beta.
 The streamlit application demonstrates the implementation of a RAG (Retrieval Augmented Generation) on scientific documents that we are developing at NIMS (National Institute for Materials Science), in Tsukuba, Japan.
 Different from most other projects, we focus on scientific articles.
 We target only the full text using [Grobid](https://github.com/kermitt2/grobid), which provides cleaner results than a raw PDF2Text converter (comparable with most other solutions).
@@ -29,11 +31,6 @@ The conversation is kept in memory by a buffered sliding window memory (top 4 mo
 
 (The image on the right was generated with https://huggingface.co/spaces/stabilityai/stable-diffusion)
 
-**Demos**:
-- (stable version): https://lfoppiano-document-qa.hf.space/
-- (unstable version): https://document-insights.streamlit.app/
-
-
 
 [<img src="https://img.youtube.com/vi/M4UaYs5WKGs/hqdefault.jpg" height="300" align="right"
 />](https://www.youtube.com/embed/M4UaYs5WKGs)
@@ -41,7 +38,7 @@ The conversation is kept in memory by a buffered sliding window memory (top 4 mo
 ## Getting started
 
 - Select the model+embedding combination you want to use
-- If using
+- If using gpt3.5-turbo, gpt4 or gpt4-turbo, enter your API key ([OpenAI](https://platform.openai.com/account/api-keys)).
 - Upload a scientific article as a PDF document. You will see a spinner or loading indicator while the processing is in progress.
 - Once the spinner disappears, you can proceed to ask your questions
 
@@ -68,11 +65,22 @@ Indicates whether sending a question to the LLM (Language Model) or to the vecto
 - Embeddings: the response will consist of the raw text from the document related to the question (based on the embeddings). This mode helps to test why sometimes the answers are not satisfying or incomplete.
 
 ### NER (Named Entities Recognition)
-
 This feature is specifically crafted for people working with scientific documents in materials science.
 It enables running NER on the response from the LLM, to identify materials mentions and properties (quantities, measurements).
 This feature leverages both the [grobid-quantities](https://github.com/kermitt2/grobid-quanities) and [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors) external services.
 
+### Troubleshooting
+Error: `streamlit: Your system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0`.
+Here is the [solution on Linux](https://stackoverflow.com/questions/76958817/streamlit-your-system-has-an-unsupported-version-of-sqlite3-chroma-requires-sq).
+For more information, see the [details](https://docs.trychroma.com/troubleshooting#sqlite) on the Chroma website.
+
+## Disclaimer on Data, Security, and Privacy ⚠️
+
+Please read carefully:
+
+- Avoid uploading sensitive data. We temporarily store text from the uploaded PDF documents only for processing your request, and we disclaim any responsibility for subsequent use or handling of the submitted data by third-party LLMs.
+- Mistral and Zephyr are FREE to use and do not require any API key, but as we leverage the free API entrypoint, there is no guarantee that all requests will go through. Use at your own risk.
+- We do not assume responsibility for how the data is utilized by the LLM end-points API.
 
 ## Development notes
 
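The new troubleshooting note points at Chroma's sqlite3 requirement. A minimal sketch of the workaround described on the linked Chroma troubleshooting page, assuming `pysqlite3-binary` is installed (it is not among this commit's requirements):

```python
# Swap pysqlite3 in for the stdlib sqlite3 before chromadb imports it.
# Assumption: `pip install pysqlite3-binary` was run beforehand.
__import__("pysqlite3")
import sys
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

import chromadb  # now sees sqlite3 >= 3.35.0
```

The usual placement is at the very top of `streamlit_app.py`, before any chromadb or langchain import.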
document_qa/document_qa_engine.py
CHANGED
@@ -16,6 +16,7 @@ from langchain.vectorstores import Chroma
 from tqdm import tqdm
 
 
+
 class DocumentQAEngine:
     llm = None
     qa_chain_type = None
pyproject.toml
CHANGED
@@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm"]
 build-backend = "setuptools.build_meta"
 
 [tool.bumpversion]
-current_version = "0.3.
+current_version = "0.3.3"
 commit = "true"
 tag = "true"
 tag_name = "v{new_version}"
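The `tag_name` template in `[tool.bumpversion]` uses Python `str.format`-style placeholders; a quick illustration of the tag a bump would produce (the version value here is assumed, matching the CHANGELOG):

```python
# bumpversion substitutes {new_version} with the bumped version string.
print("v{new_version}".format(new_version="0.3.4"))  # -> v0.3.4
```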
requirements.txt
CHANGED
@@ -1,12 +1,12 @@
 # Grobid
 grobid-quantities-client==0.4.0
-grobid-client-python==0.0.
+grobid-client-python==0.0.7
 grobid_tei_xml==0.1.3
 
 # Utils
-tqdm
-pyyaml==6.0
-pytest
+tqdm==4.66.1
+pyyaml==6.0.1
+pytest==7.4.3
 streamlit==1.29.0
 lxml
 Beautifulsoup4
@@ -15,10 +15,10 @@ watchdog
 dateparser
 
 # LLM
-chromadb==0.4.
+chromadb==0.4.19
 tiktoken==0.4.0
 openai==0.27.7
-langchain==0.0.
+langchain==0.0.350
 typing-inspect==0.9.0
 typing_extensions==4.8.0
 pydantic==2.4.2
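This commit pins the previously floating utilities and bumps chromadb and langchain. An optional sketch (pins taken from the file above) to verify the environment matches the new pins:

```python
# Verify a few of the newly pinned versions resolve in the current environment.
from importlib.metadata import version

pins = {"tqdm": "4.66.1", "pyyaml": "6.0.1", "chromadb": "0.4.19", "langchain": "0.0.350"}
for pkg, pinned in pins.items():
    installed = version(pkg)
    status = "OK" if installed == pinned else f"MISMATCH (pinned {pinned})"
    print(f"{pkg}=={installed}  {status}")
```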
streamlit_app.py
CHANGED
@@ -19,6 +19,10 @@ from document_qa.document_qa_engine import DocumentQAEngine
 from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
 from grobid_client_generic import GrobidClientGeneric
 
+OPENAI_MODELS = ['chatgpt-3.5-turbo',
+                 "gpt-4",
+                 "gpt-4-1106-preview"]
+
 if 'rqa' not in st.session_state:
     st.session_state['rqa'] = {}
 
@@ -123,17 +127,17 @@ def clear_memory():
 # @st.cache_resource
 def init_qa(model, api_key=None):
     ## For debug add: callbacks=[PromptLayerCallbackHandler(pl_tags=["langchain", "chatgpt", "document-qa"])])
-    if model
+    if model in OPENAI_MODELS:
         st.session_state['memory'] = ConversationBufferWindowMemory(k=4)
         if api_key:
-            chat = ChatOpenAI(model_name=
+            chat = ChatOpenAI(model_name=model,
                               temperature=0,
                               openai_api_key=api_key,
                               frequency_penalty=0.1)
             embeddings = OpenAIEmbeddings(openai_api_key=api_key)
 
         else:
-            chat = ChatOpenAI(model_name=
+            chat = ChatOpenAI(model_name=model,
                               temperature=0,
                               frequency_penalty=0.1)
             embeddings = OpenAIEmbeddings()
@@ -212,20 +216,23 @@ def play_old_messages():
 # is_api_key_provided = st.session_state['api_key']
 
 with st.sidebar:
-    st.session_state['model'] = model = st.
-        "Model",
-
-
-
-        "
-        "
-        "
+    st.session_state['model'] = model = st.selectbox(
+        "Model:",
+        options=[
+            "chatgpt-3.5-turbo",
+            "mistral-7b-instruct-v0.1",
+            "zephyr-7b-beta",
+            "gpt-4",
+            "gpt-4-1106-preview"
         ],
-
-
+        index=2,
+        placeholder="Select model",
+        help="Select the LLM model:",
+        disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
+    )
 
     st.markdown(
-        ":warning:
+        ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa/tree/review-interface#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")
 
     if (model == 'mistral-7b-instruct-v0.1' or model == 'zephyr-7b-beta') and model not in st.session_state['api_keys']:
         if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
@@ -244,7 +251,7 @@ with st.sidebar:
         # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
         st.session_state['rqa'][model] = init_qa(model)
 
-    elif model
+    elif model in OPENAI_MODELS and model not in st.session_state['api_keys']:
        if 'OPENAI_API_KEY' not in os.environ:
            api_key = st.text_input('OpenAI API Key', type="password")
            st.markdown("Get it [here](https://platform.openai.com/account/api-keys)")
@@ -303,9 +310,9 @@ with st.sidebar:
                               help="Number of chunks to consider when answering a question",
                               disabled=not uploaded_file)
 
-    st.session_state['ner_processing'] = st.checkbox("
+    st.session_state['ner_processing'] = st.checkbox("Identify materials and properties.")
     st.markdown(
-        '
+        'The LLM responses undergo post-processing to extract <span style="color:orange">physical quantities, measurements</span>, and <span style="color:green">materials</span> mentions.',
         unsafe_allow_html=True)
 
     st.divider()
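The recurring change in `streamlit_app.py` is the switch from per-model string comparisons to membership tests against `OPENAI_MODELS`. A standalone sketch of that gating pattern (the `needs_openai_key` helper and the free-model grouping are illustrative, not from the commit):

```python
# Gating pattern introduced by the commit: OpenAI-hosted models live in one
# list, so adding gpt-4 / gpt-4-1106-preview requires no new branches.
OPENAI_MODELS = ['chatgpt-3.5-turbo', "gpt-4", "gpt-4-1106-preview"]
FREE_MODELS = ['mistral-7b-instruct-v0.1', 'zephyr-7b-beta']  # hypothetical grouping

def needs_openai_key(model: str) -> bool:
    """True when the selected model is served through the OpenAI API."""
    return model in OPENAI_MODELS

for m in OPENAI_MODELS + FREE_MODELS:
    print(f"{m}: {'OpenAI API key required' if needs_openai_key(m) else 'free HF inference endpoint'}")
```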