iloncka committed on
Commit
d5384bf
1 Parent(s): 7d6f684

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -69
app.py CHANGED
@@ -15,19 +15,24 @@ from streamlit_lottie import st_lottie
15
  st.set_page_config(page_title="QA-project", page_icon="📇")
16
  os.environ['TOKENIZERS_PARALLELISM'] = "false"
17
  DATA_DIR = './dataset'
 
18
  DOCS_PATH = os.path.join(DATA_DIR, 'all_docs_36838.pkl')
19
  LOTTIE_PATH = './img/108423-search-for-documents.json'
20
- PROG_TITLE = "QA project Demo"
 
 
 
 
21
  # Adjust to a question that you would like users to see in the search bar when they load the UI:
22
  DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "Что делает Домашняя бухгалтерия?")
23
  DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "Домашняя бухгалтерия позволяет вести счета в разных валютах")
24
 
25
- def place_header_center(text, lottie_data):
26
- img, title= st.columns([1,3])
27
- with img:
28
- st_lottie(lottie_data, height=150)
29
- with title:
30
- st.title(text)
31
 
32
 
33
  @st.experimental_memo
@@ -41,49 +46,23 @@ def load_and_write_data(document_store):
41
 
42
  with open(DOCS_PATH, "rb") as f:
43
  docs = dill.load(f)
44
-
45
  document_store.write_documents(docs)
46
 
47
 
48
- def get_backlink(result):
49
- if result.get("document", None):
50
- doc = result["document"]
51
- if isinstance(doc, dict):
52
- if doc.get("meta", None):
53
- if isinstance(doc["meta"], dict):
54
- if doc["meta"].get("url", None):
55
- return doc["meta"]["url"]
56
- return None
57
-
58
-
59
- def get_doc_name(result):
60
- if result.get("document", None):
61
- doc = result["document"]
62
- if isinstance(doc, dict):
63
- if doc.get("meta", None):
64
- if isinstance(doc["meta"], dict):
65
- if doc["meta"].get("name", None):
66
- return doc["meta"]["name"]
67
- return None
68
-
69
  def get_doc_reg_id(result):
70
- if result.get("document", None):
71
- doc = result["document"]
72
- if isinstance(doc, dict):
73
- if doc.get("meta", None):
74
- if isinstance(doc["meta"], dict):
75
- if doc["meta"].get("reg_id", None):
76
- return doc["meta"]["reg_id"]
77
  return None
 
 
78
  # Haystack Components
79
- # @st.cache(allow_output_mutation=True)
80
- # def start_haystack():
81
  document_store = InMemoryDocumentStore() # use_bm25=True
82
  load_and_write_data(document_store)
83
  retriever = TfidfRetriever(document_store=document_store)
84
  reader = FARMReader(model_name_or_path="DeepPavlov/rubert-base-cased-sentence",
85
- use_gpu=False,
86
- num_processes=1)
87
  pipeline = ExtractiveQAPipeline(reader, retriever)
88
 
89
 
@@ -101,16 +80,23 @@ def reset_results(*args):
101
 
102
  # Streamlit App
103
  lottie_data = get_lottie(LOTTIE_PATH)
104
- place_header_center(PROG_TITLE, lottie_data)
 
 
 
 
 
 
105
 
106
  st.markdown("""
107
- This QA demo uses a [Haystack Extractive QA Pipeline](https://haystack.deepset.ai/components/ready-made-pipelines#extractiveqapipeline) with
108
- an [InMemoryDocumentStore](https://haystack.deepset.ai/components/document-store) which contains documents about different program modules
109
- Go ahead and ask questions about the program modules functionality!
 
110
  """, unsafe_allow_html=True)
111
 
112
  question = st.text_input("", value=st.session_state.question, max_chars=100, on_change=reset_results)
113
-
114
 
115
  def ask_question(question):
116
  prediction = pipeline.run(query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
@@ -119,13 +105,16 @@ def ask_question(question):
119
  for answer in answers:
120
  answer = answer.to_dict()
121
  if answer.get("answer", None):
 
122
  results.append(
123
  {
124
  "context": "..." + answer["context"] + "...",
125
  "answer": answer.get("answer", None),
126
- "source": answer["meta"]["name"],
127
  "relevance": round(answer["score"] * 100, 2),
128
- "document": [doc for doc in response["documents"] if doc["id"] == answer["document_id"]][0],
 
 
129
  "offset_start_in_doc": answer["offsets_in_document"][0]["start"],
130
  "_raw": answer,
131
  }
@@ -144,7 +133,7 @@ def ask_question(question):
144
 
145
 
146
  if question:
147
- with st.spinner("🕰️    Performing semantic search on program modules..."):
148
  try:
149
  msg = 'Asked ' + question
150
  logging.info(msg)
@@ -154,34 +143,31 @@ if question:
154
 
155
 
156
  if st.session_state.results:
157
- st.write('## Top Results')
158
  for count, result in enumerate(st.session_state.results):
159
  if result["answer"]:
160
- answer, context = result["answer"], result["context"]
161
- start_idx = context.find(answer)
162
- end_idx = start_idx + len(answer)
 
 
 
 
 
 
 
 
 
 
163
  st.write(
164
- markdown(context[:start_idx] + str(annotation(body=answer, label="ANSWER", background="#ff700f", color='#ffffff')) + context[end_idx:]),
165
  unsafe_allow_html=True,
166
  )
167
- source = ""
168
- url = get_backlink(result)
169
- name = get_doc_name(result)
170
- reg_id = get_doc_reg_id(result)
171
- if name:
172
- source += f"[{result['document']['meta']['name']}]"
173
-
174
- if url:
175
- source += f"({result['document']['meta']['url']})"
176
 
177
- if reg_id:
178
- source += f"({result['document']['meta']['reg_id']})"
179
- if source:
180
- st.markdown(f"**Relevance:** {result['relevance']} - **Source:** {source}")
181
- else:
182
- st.markdown(f"**Relevance:** {result['relevance']}")
183
 
184
  else:
185
  st.info(
186
- "🤔    Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
187
  )
 
15
  st.set_page_config(page_title="QA-project", page_icon="📇")
16
  os.environ['TOKENIZERS_PARALLELISM'] = "false"
17
  DATA_DIR = './dataset'
18
+ NAMES_DICT_PATH = 'mod_names_dict.pkl'
19
  DOCS_PATH = os.path.join(DATA_DIR, 'all_docs_36838.pkl')
20
  LOTTIE_PATH = './img/108423-search-for-documents.json'
21
+ PROG_TITLE = "Научные кейсы"
22
+ PROG_SUBTITLE = "Рекомендации по существующим в компании компонентам цифровых продуктов для решения новых бизнес-задач"
23
+
24
+
25
+
26
  # Adjust to a question that you would like users to see in the search bar when they load the UI:
27
  DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "Что делает Домашняя бухгалтерия?")
28
  DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "Домашняя бухгалтерия позволяет вести счета в разных валютах")
29
 
30
+
31
@st.experimental_memo
def load_dict(path):
    """Load a serialized object from ``path`` with ``dill`` and return it.

    The file is opened in binary mode and is closed as soon as loading
    finishes. The call is wrapped by ``st.experimental_memo``.

    Args:
        path: Location of the serialized file.

    Returns:
        Whatever object ``dill.load`` reconstructs from the file.

    NOTE(review): ``dill.load`` can execute code embedded in the file, so it
    must only be pointed at trusted files shipped with the app.
    """
    with open(path, "rb") as serialized:
        return dill.load(serialized)
36
 
37
 
38
  @st.experimental_memo
 
46
 
47
  with open(DOCS_PATH, "rb") as f:
48
  docs = dill.load(f)
49
+
50
  document_store.write_documents(docs)
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def get_doc_reg_id(result):
54
+ if result.get("reg_id", None):
55
+ reg_id = result["reg_id"]
56
+ return reg_id
 
 
 
 
57
  return None
58
+
59
+
60
  # Haystack Components
 
 
61
  document_store = InMemoryDocumentStore() # use_bm25=True
62
  load_and_write_data(document_store)
63
  retriever = TfidfRetriever(document_store=document_store)
64
  reader = FARMReader(model_name_or_path="DeepPavlov/rubert-base-cased-sentence",
65
+ use_gpu=False)
 
66
  pipeline = ExtractiveQAPipeline(reader, retriever)
67
 
68
 
 
80
 
81
  # Streamlit App
82
  lottie_data = get_lottie(LOTTIE_PATH)
83
+ img, title= st.columns([2,3])
84
+ with img:
85
+ st_lottie(lottie_data) #, height=350
86
+ with title:
87
+ st.title(PROG_TITLE)
88
+ st.subheader(PROG_SUBTITLE)
89
+
90
 
91
  st.markdown("""
92
+ Это демонстрационная версия сервиса поисковой системы программных продуктов с использованием технологии
93
+ [Haystack Extractive QA Pipeline](https://haystack.deepset.ai/components/ready-made-pipelines#extractiveqapipeline)
94
+ и [InMemoryDocumentStore](https://haystack.deepset.ai/components/document-store)
95
+ Чтобы испытать сервис можно задавать вопросы в свободной форме по функционалу программных продуктов.
96
  """, unsafe_allow_html=True)
97
 
98
  question = st.text_input("", value=st.session_state.question, max_chars=100, on_change=reset_results)
99
+ mod_names_dict = load_dict(NAMES_DICT_PATH)
100
 
101
  def ask_question(question):
102
  prediction = pipeline.run(query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
 
105
  for answer in answers:
106
  answer = answer.to_dict()
107
  if answer.get("answer", None):
108
+ document = [doc for doc in prediction["documents"] if (doc.to_dict()["id"] == answer["document_id"])][0]
109
  results.append(
110
  {
111
  "context": "..." + answer["context"] + "...",
112
  "answer": answer.get("answer", None),
113
+ "source": answer["meta"]["name"] if answer["meta"].get("name", None) else answer["meta"]['url'],
114
  "relevance": round(answer["score"] * 100, 2),
115
+ "document": document.content,
116
+ "doc_score": document.score,
117
+ "reg_id": document.meta["reg_id"],
118
  "offset_start_in_doc": answer["offsets_in_document"][0]["start"],
119
  "_raw": answer,
120
  }
 
133
 
134
 
135
  if question:
136
+ with st.spinner("🕰️    Производится семантический поиск по информационной базе ..."):
137
  try:
138
  msg = 'Asked ' + question
139
  logging.info(msg)
 
143
 
144
 
145
  if st.session_state.results:
146
+ st.write('## Результаты')
147
  for count, result in enumerate(st.session_state.results):
148
  if result["answer"]:
149
+ answer, context = result["answer"], result["document"]
150
+ start_idx = context.find(result["context"])
151
+ end_idx = start_idx + len(result["context"])
152
+ reg_id = get_doc_reg_id(result)
153
+ module_info = ''
154
+ if reg_id:
155
+ module_name = mod_names_dict.get(reg_id, None)
156
+ if module_name:
157
+ module_info = f"**Наименование модуля/программы: :orange[{module_name}]**"
158
+ else:
159
+ module_info = f"Наименование модуля/программы отсутствует!"
160
+
161
+ st.markdown(f"{module_info} - **Релевантность:** {result['relevance']}")
162
  st.write(
163
+ markdown(context[:start_idx] + str(annotation(body=result["context"], label="ANSWER", background="#ff700f", color='#ffffff')) + context[end_idx:]),
164
  unsafe_allow_html=True,
165
  )
 
 
 
 
 
 
 
 
 
166
 
167
+ st.markdown(f"**Источник:** {result['source']}")
168
+
 
 
 
 
169
 
170
  else:
171
  st.info(
172
+ "🤔    Поисковая система не справилась с Вашим запросом. Попробуйте его переформулировать!"
173
  )