Abhilash V J committed on
Commit
bd5eb62
β€’
1 Parent(s): 458615d

Added file upload option

Browse files
.gitattributes CHANGED
@@ -1,34 +1,34 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tflite filter=lfs diff=lfs merge=lfs -text
29
- *.tgz filter=lfs diff=lfs merge=lfs -text
30
- *.wasm filter=lfs diff=lfs merge=lfs -text
31
- *.xz filter=lfs diff=lfs merge=lfs -text
32
- *.zip filter=lfs diff=lfs merge=lfs -text
33
- *.zst filter=lfs diff=lfs merge=lfs -text
34
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/Test DB-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
.streamlit/secrets.toml ADDED
@@ -0,0 +1 @@
 
 
1
+ pinecone_apikey = "6a76246e-2b5f-46f5-aab8-9cf43d6c94fb"
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: Haystack QA
3
- emoji: πŸ“š
4
- colorFrom: yellow
5
- colorTo: green
6
- sdk: streamlit
7
- sdk_version: 1.15.2
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Haystack QA
3
+ emoji: πŸ“š
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.15.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Test DB.ipynb ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "5736235d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from haystack.document_stores import PineconeDocumentStore"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "c4925511",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "document_store = PineconeDocumentStore(\n",
21
+ " api_key= \"6a76246e-2b5f-46f5-aab8-9cf43d6c94fb\",\n",
22
+ " index='qa_demo',\n",
23
+ " similarity=\"cosine\",\n",
24
+ " embedding_dim=768\n",
25
+ " )"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "id": "c884286a",
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "document_store.get"
36
+ ]
37
+ }
38
+ ],
39
+ "metadata": {
40
+ "kernelspec": {
41
+ "display_name": "Python 3 (ipykernel)",
42
+ "language": "python",
43
+ "name": "python3"
44
+ },
45
+ "language_info": {
46
+ "codemirror_mode": {
47
+ "name": "ipython",
48
+ "version": 3
49
+ },
50
+ "file_extension": ".py",
51
+ "mimetype": "text/x-python",
52
+ "name": "python",
53
+ "nbconvert_exporter": "python",
54
+ "pygments_lexer": "ipython3",
55
+ "version": "3.8.10"
56
+ }
57
+ },
58
+ "nbformat": 4,
59
+ "nbformat_minor": 5
60
+ }
app.py CHANGED
@@ -1,172 +1,241 @@
1
- import os
2
- import sys
3
- import logging
4
- from pathlib import Path
5
- from json import JSONDecodeError
6
- import pandas as pd
7
- import streamlit as st
8
- from annotated_text import annotation
9
- from markdown import markdown
10
- import json
11
- from haystack import Document
12
- import pandas as pd
13
- from haystack.document_stores import PineconeDocumentStore
14
- from haystack.nodes import EmbeddingRetriever, FARMReader
15
- from haystack.pipelines import ExtractiveQAPipeline
16
-
17
- # @st.cache
18
- def create_doc_store():
19
- document_store = PineconeDocumentStore(
20
- api_key= st.secrets["pinecone_apikey"],
21
- index='qa_demo',
22
- similarity="cosine",
23
- embedding_dim=768
24
- )
25
- return document_store
26
-
27
- # @st.cache
28
- def create_pipe(document_store):
29
- retriever = EmbeddingRetriever(
30
- document_store=document_store,
31
- embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
32
- model_format="sentence_transformers",
33
- )
34
- reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
35
- pipe = ExtractiveQAPipeline(reader, retriever)
36
- return pipe
37
-
38
- def query(pipe, question, top_k_reader, top_k_retriever):
39
- res = pipe.run(
40
- query=question, params={"Retriever": {"top_k": top_k_retriever}, "Reader": {"top_k": top_k_reader}}
41
- )
42
- answer_df = []
43
- # for r in res['answers']:
44
- # ans_dict = res['answers'][0].meta
45
- # ans_dict["answer"] = r.context
46
- # answer_df.append(ans_dict)
47
- # result = pd.DataFrame(answer_df)
48
- # result.columns = ["Source","Title","Year","Link","Answer"]
49
- # result[["Answer","Link","Source","Title","Year"]]
50
- return res
51
-
52
- document_store = create_doc_store()
53
- pipe = create_pipe(document_store)
54
-
55
- def set_state_if_absent(key, value):
56
- if key not in st.session_state:
57
- st.session_state[key] = value
58
-
59
- # Adjust to a question that you would like users to see in the search bar when they load the UI:
60
- DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics.")
61
- DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "7% more remote workers have been at their current organization for 5 years or fewer")
62
-
63
- # Sliders
64
- DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
65
- DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))
66
-
67
-
68
- st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")
69
-
70
- # Persistent state
71
- set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
72
- set_state_if_absent("answer", DEFAULT_ANSWER_AT_STARTUP)
73
- set_state_if_absent("results", None)
74
-
75
-
76
- # Small callback to reset the interface in case the text of the question changes
77
- def reset_results(*args):
78
- st.session_state.answer = None
79
- st.session_state.results = None
80
- st.session_state.raw_json = None
81
-
82
- # Title
83
- st.write("# Haystack Search Demo")
84
- st.markdown(
85
- """
86
- This demo takes its data from two sample data csv with statistics on various topics. \n
87
- Ask any question on this topic and see if Haystack can find the correct answer to your query! \n
88
- *Note: do not use keywords, but full-fledged questions.* The demo is not optimized to deal with keyword queries and might misunderstand you.
89
- """,
90
- unsafe_allow_html=True,
91
- )
92
-
93
- # Sidebar
94
- st.sidebar.header("Options")
95
- top_k_reader = st.sidebar.slider(
96
- "Max. number of answers",
97
- min_value=1,
98
- max_value=10,
99
- value=DEFAULT_NUMBER_OF_ANSWERS,
100
- step=1,
101
- on_change=reset_results,
102
- )
103
- top_k_retriever = st.sidebar.slider(
104
- "Max. number of documents from retriever",
105
- min_value=1,
106
- max_value=10,
107
- value=DEFAULT_DOCS_FROM_RETRIEVER,
108
- step=1,
109
- on_change=reset_results,
110
- )
111
- # data_files = st.file_uploader(
112
- # "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
113
- # )
114
- # for data_file in data_files:
115
- # # Upload file
116
- # if data_file:
117
- # raw_json = upload_doc(data_file)
118
-
119
- question = st.text_input(
120
- value=st.session_state.question,
121
- max_chars=100,
122
- on_change=reset_results,
123
- label="question",
124
- label_visibility="hidden",
125
- )
126
- col1, col2 = st.columns(2)
127
- col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
128
- col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
129
-
130
- # Run button
131
- run_pressed = col1.button("Run")
132
- if run_pressed:
133
-
134
- run_query = (
135
- run_pressed or question != st.session_state.question
136
- )
137
- # Get results for query
138
- if run_query and question:
139
- reset_results()
140
- st.session_state.question = question
141
-
142
- with st.spinner(
143
- "🧠 &nbsp;&nbsp; Performing neural search on documents... \n "
144
- ):
145
- try:
146
- st.session_state.results = query(
147
- pipe, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
148
- )
149
- except JSONDecodeError as je:
150
- st.error("πŸ‘“ &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
151
- except Exception as e:
152
- logging.exception(e)
153
- if "The server is busy processing requests" in str(e) or "503" in str(e):
154
- st.error("πŸ§‘β€πŸŒΎ &nbsp;&nbsp; All our workers are busy! Try again later.")
155
- else:
156
- st.error(f"🐞 &nbsp;&nbsp; An error occurred during the request. {str(e)}")
157
-
158
-
159
- if st.session_state.results:
160
-
161
- st.write("## Results:")
162
-
163
- for count, result in enumerate(st.session_state.results['answers']):
164
- answer, context = result.answer, result.context
165
- start_idx = context.find(answer)
166
- end_idx = start_idx + len(answer)
167
- source = f"[{result.meta['Title']}]({result.meta['link']})"
168
- # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
169
- st.write(
170
- markdown(f'**Source:** {source} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
171
- unsafe_allow_html=True,
172
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import logging
4
+ from pathlib import Path
5
+ from json import JSONDecodeError
6
+ import pandas as pd
7
+ import streamlit as st
8
+ from annotated_text import annotation
9
+ from markdown import markdown
10
+ import json
11
+ from haystack import Document
12
+ import pandas as pd
13
+ from haystack.document_stores import PineconeDocumentStore
14
+ from haystack.nodes import EmbeddingRetriever, FARMReader
15
+ from haystack.pipelines import ExtractiveQAPipeline
16
+ import shutil
17
+ import uuid
18
+ from pathlib import Path
19
+ from haystack.pipelines import Pipeline
20
+ from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter
21
+
22
+
23
+ preprocessor = PreProcessor(
24
+ clean_empty_lines=True,
25
+ clean_whitespace=True,
26
+ clean_header_footer=False,
27
+ split_by="word",
28
+ split_length=100,
29
+ split_respect_sentence_boundary=True
30
+ )
31
+ file_type_classifier = FileTypeClassifier()
32
+ text_converter = TextConverter()
33
+ pdf_converter = PDFToTextConverter()
34
+ docx_converter = DocxToTextConverter()
35
+
36
+
37
+ FILE_UPLOAD_PATH= "./data/uploads/"
38
+ os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
39
+ # @st.cache
40
+ def create_doc_store():
41
+ document_store = PineconeDocumentStore(
42
+ api_key= st.secrets["pinecone_apikey"],
43
+ index='qa_demo',
44
+ similarity="cosine",
45
+ embedding_dim=768
46
+ )
47
+ return document_store
48
+
49
+ # @st.cache
50
+ # def create_pipe(document_store):
51
+ # retriever = EmbeddingRetriever(
52
+ # document_store=document_store,
53
+ # embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
54
+ # model_format="sentence_transformers",
55
+ # )
56
+ # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
57
+ # pipe = ExtractiveQAPipeline(reader, retriever)
58
+ # return pipe
59
+
60
+ def query(pipe, question, top_k_reader, top_k_retriever):
61
+ res = pipe.run(
62
+ query=question, params={"Retriever": {"top_k": top_k_retriever}, "Reader": {"top_k": top_k_reader}}
63
+ )
64
+ answer_df = []
65
+ # for r in res['answers']:
66
+ # ans_dict = res['answers'][0].meta
67
+ # ans_dict["answer"] = r.context
68
+ # answer_df.append(ans_dict)
69
+ # result = pd.DataFrame(answer_df)
70
+ # result.columns = ["Source","Title","Year","Link","Answer"]
71
+ # result[["Answer","Link","Source","Title","Year"]]
72
+ return res
73
+
74
+ document_store = create_doc_store()
75
+ # pipe = create_pipe(document_store)
76
+ retriever = EmbeddingRetriever(
77
+ document_store=document_store,
78
+ embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
79
+ model_format="sentence_transformers",
80
+ )
81
+ reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
82
+ pipe = ExtractiveQAPipeline(reader, retriever)
83
+
84
+ indexing_pipeline_with_classification = Pipeline()
85
+ indexing_pipeline_with_classification.add_node(
86
+ component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
87
+ )
88
+ indexing_pipeline_with_classification.add_node(
89
+ component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"]
90
+ )
91
+ indexing_pipeline_with_classification.add_node(
92
+ component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"]
93
+ )
94
+ indexing_pipeline_with_classification.add_node(
95
+ component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"]
96
+ )
97
+ indexing_pipeline_with_classification.add_node(
98
+ component=preprocessor,
99
+ name="Preprocessor",
100
+ inputs=["TextConverter", "PdfConverter", "DocxConverter"],
101
+ )
102
+ indexing_pipeline_with_classification.add_node(
103
+ component=document_store, name="DocumentStore", inputs=["Preprocessor"]
104
+ )
105
+
106
+ def set_state_if_absent(key, value):
107
+ if key not in st.session_state:
108
+ st.session_state[key] = value
109
+
110
+ # Adjust to a question that you would like users to see in the search bar when they load the UI:
111
+ DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics.")
112
+ DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "7% more remote workers have been at their current organization for 5 years or fewer")
113
+
114
+ # Sliders
115
+ DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
116
+ DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))
117
+
118
+
119
+ st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")
120
+
121
+ # Persistent state
122
+ set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
123
+ set_state_if_absent("answer", DEFAULT_ANSWER_AT_STARTUP)
124
+ set_state_if_absent("results", None)
125
+
126
+
127
+ # Small callback to reset the interface in case the text of the question changes
128
+ def reset_results(*args):
129
+ st.session_state.answer = None
130
+ st.session_state.results = None
131
+ st.session_state.raw_json = None
132
+
133
+ # Title
134
+ st.write("# Haystack Search Demo")
135
+ st.markdown(
136
+ """
137
+ This demo takes its data from two sample data csv with statistics on various topics. \n
138
+ Ask any question on this topic and see if Haystack can find the correct answer to your query! \n
139
+ *Note: do not use keywords, but full-fledged questions.* The demo is not optimized to deal with keyword queries and might misunderstand you.
140
+ """,
141
+ unsafe_allow_html=True,
142
+ )
143
+
144
+ # Sidebar
145
+ st.sidebar.header("Options")
146
+ st.sidebar.write("## File Upload:")
147
+ data_files = st.sidebar.file_uploader(
148
+ "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
149
+ )
150
+ ALL_FILES = []
151
+ for data_file in data_files:
152
+ # Upload file
153
+ if data_file:
154
+ file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{data_file.name}"
155
+ with file_path.open("wb") as buffer:
156
+ shutil.copyfileobj(data_file.file, buffer)
157
+ ALL_FILES.append(file_path)
158
+ st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; βœ… ")
159
+ indexing_pipeline_with_classification.run(file_paths=ALL_FILES)
160
+
161
+ if len(ALL_FILES) > 0:
162
+ document_store.update_embeddings(retriever, update_existing_embeddings=False)
163
+
164
+ top_k_reader = st.sidebar.slider(
165
+ "Max. number of answers",
166
+ min_value=1,
167
+ max_value=10,
168
+ value=DEFAULT_NUMBER_OF_ANSWERS,
169
+ step=1,
170
+ on_change=reset_results,
171
+ )
172
+ top_k_retriever = st.sidebar.slider(
173
+ "Max. number of documents from retriever",
174
+ min_value=1,
175
+ max_value=10,
176
+ value=DEFAULT_DOCS_FROM_RETRIEVER,
177
+ step=1,
178
+ on_change=reset_results,
179
+ )
180
+ # data_files = st.file_uploader(
181
+ # "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
182
+ # )
183
+ # for data_file in data_files:
184
+ # # Upload file
185
+ # if data_file:
186
+ # raw_json = upload_doc(data_file)
187
+
188
+ question = st.text_input(
189
+ value=st.session_state.question,
190
+ max_chars=100,
191
+ on_change=reset_results,
192
+ label="question",
193
+ label_visibility="hidden",
194
+ )
195
+ col1, col2 = st.columns(2)
196
+ col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
197
+ col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
198
+
199
+ # Run button
200
+ run_pressed = col1.button("Run")
201
+ if run_pressed:
202
+
203
+ run_query = (
204
+ run_pressed or question != st.session_state.question
205
+ )
206
+ # Get results for query
207
+ if run_query and question:
208
+ reset_results()
209
+ st.session_state.question = question
210
+
211
+ with st.spinner(
212
+ "🧠 &nbsp;&nbsp; Performing neural search on documents... \n "
213
+ ):
214
+ try:
215
+ st.session_state.results = query(
216
+ pipe, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
217
+ )
218
+ except JSONDecodeError as je:
219
+ st.error("πŸ‘“ &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
220
+ except Exception as e:
221
+ logging.exception(e)
222
+ if "The server is busy processing requests" in str(e) or "503" in str(e):
223
+ st.error("πŸ§‘β€πŸŒΎ &nbsp;&nbsp; All our workers are busy! Try again later.")
224
+ else:
225
+ st.error(f"🐞 &nbsp;&nbsp; An error occurred during the request. {str(e)}")
226
+
227
+
228
+ if st.session_state.results:
229
+
230
+ st.write("## Results:")
231
+
232
+ for count, result in enumerate(st.session_state.results['answers']):
233
+ answer, context = result.answer, result.context
234
+ start_idx = context.find(answer)
235
+ end_idx = start_idx + len(answer)
236
+ source = f"[{result.meta['Title']}]({result.meta['link']})"
237
+ # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
238
+ st.write(
239
+ markdown(f'**Source:** {source} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
240
+ unsafe_allow_html=True,
241
+ )
pinecorn.haystack-pipeline.yml.yml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To allow your IDE to autocomplete and validate your YAML pipelines, name them as <name of your choice>.haystack-pipeline.yml
2
+
3
+ version: ignore
4
+
5
+ components: # define all the building-blocks for Pipeline
6
+ - name: DocumentStore
7
+ type: ElasticsearchDocumentStore
8
+ params:
9
+ index: qa_demo
10
+ similarity: cosine
11
+ embedding_dim: 768
12
+ - name: Retriever
13
+ type: BM25Retriever
14
+ params:
15
+ document_store: DocumentStore # params can reference other components defined in the YAML
16
+ top_k: 5
17
+ - name: Reader # custom-name for the component; helpful for visualization & debugging
18
+ type: FARMReader # Haystack Class name for the component
19
+ params:
20
+ model_name_or_path: deepset/roberta-base-squad2
21
+ context_window_size: 500
22
+ return_no_answer: true
23
+ - name: TextFileConverter
24
+ type: TextConverter
25
+ - name: PDFFileConverter
26
+ type: PDFToTextConverter
27
+ - name: Preprocessor
28
+ type: PreProcessor
29
+ params:
30
+ split_by: word
31
+ split_length: 1000
32
+ - name: FileTypeClassifier
33
+ type: FileTypeClassifier
34
+
35
+ pipelines:
36
+ - name: query # a sample extractive-qa Pipeline
37
+ nodes:
38
+ - name: Retriever
39
+ inputs: [Query]
40
+ - name: Reader
41
+ inputs: [Retriever]
42
+ - name: indexing
43
+ nodes:
44
+ - name: FileTypeClassifier
45
+ inputs: [File]
46
+ - name: TextFileConverter
47
+ inputs: [FileTypeClassifier.output_1]
48
+ - name: PDFFileConverter
49
+ inputs: [FileTypeClassifier.output_2]
50
+ - name: Preprocessor
51
+ inputs: [PDFFileConverter, TextFileConverter]
52
+ - name: Retriever
53
+ inputs: [Preprocessor]
54
+ - name: DocumentStore
55
+ inputs: [Retriever]
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
- protobuf==3.19
2
- streamlit==1.13
3
- st-annotated-text
4
- farm-haystack[pinecone]
5
- pinecone-client
6
- datasets
7
  tensorboard
 
1
+ protobuf==3.19
2
+ streamlit==1.13
3
+ st-annotated-text
4
+ farm-haystack[pinecone]
5
+ pinecone-client
6
+ datasets
7
  tensorboard