ugmSorcero committed on
Commit
1d466d7
1 Parent(s): 9ff8b5f

Adds a DocumentToSpeech node to the keyword search pipeline and plays the generated audio in the UI

Browse files
.gitignore CHANGED
@@ -128,4 +128,6 @@ dmypy.json
128
  # Pyre type checker
129
  .pyre/
130
 
131
- .vscode/
 
 
 
128
  # Pyre type checker
129
  .pyre/
130
 
131
+ .vscode/
132
+
133
+ data/audio/
core/pipelines.py CHANGED
@@ -2,14 +2,19 @@
2
  Haystack Pipelines
3
  """
4
 
 
5
  from haystack import Pipeline
6
  from haystack.document_stores import InMemoryDocumentStore
7
  from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
8
  from haystack.nodes.preprocessor import PreProcessor
9
  from haystack.nodes.ranker import SentenceTransformersRanker
 
 
10
 
 
 
11
 
12
- def keyword_search(index="documents", split_word_length=100):
13
  """
14
  **Keyword Search Pipeline**
15
 
@@ -44,6 +49,13 @@ def keyword_search(index="documents", split_word_length=100):
44
  index_pipeline.add_node(
45
  document_store, name="DocumentStore", inputs=["TfidfRetriever"]
46
  )
 
 
 
 
 
 
 
47
 
48
  return search_pipeline, index_pipeline
49
 
 
2
  Haystack Pipelines
3
  """
4
 
5
+ from pathlib import Path
6
  from haystack import Pipeline
7
  from haystack.document_stores import InMemoryDocumentStore
8
  from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
9
  from haystack.nodes.preprocessor import PreProcessor
10
  from haystack.nodes.ranker import SentenceTransformersRanker
11
+ from haystack.nodes.audio.document_to_speech import DocumentToSpeech
12
+ import os
13
 
14
+ data_path = 'data/'
15
+ os.makedirs(data_path, exist_ok=True)
16
 
17
+ def keyword_search(index="documents", split_word_length=100, audio_output=False):
18
  """
19
  **Keyword Search Pipeline**
20
 
 
49
  index_pipeline.add_node(
50
  document_store, name="DocumentStore", inputs=["TfidfRetriever"]
51
  )
52
+
53
+ if audio_output:
54
+ doc2speech = DocumentToSpeech(
55
+ model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
56
+ generated_audio_dir=Path(data_path + 'audio'),
57
+ )
58
+ search_pipeline.add_node(doc2speech, name='DocumentToSpeech', inputs=['TfidfRetriever'])
59
 
60
  return search_pipeline, index_pipeline
61
 
core/search_index.py CHANGED
@@ -32,13 +32,17 @@ def search(queries, pipeline):
32
  for res in matches:
33
  if not score_is_empty:
34
  score_is_empty = True if res.score is None else False
 
 
 
 
 
 
 
 
 
35
  query_results.append(
36
- {
37
- "text": res.content,
38
- "score": res.score,
39
- "id": res.meta["id"],
40
- "fragment_id": res.id,
41
- }
42
  )
43
  if not score_is_empty:
44
  query_results = sorted(
 
32
  for res in matches:
33
  if not score_is_empty:
34
  score_is_empty = True if res.score is None else False
35
+ match = {
36
+ "text": res.content,
37
+ "id": res.meta["id"],
38
+ "fragment_id": res.id,
39
+ }
40
+ if not score_is_empty:
41
+ match.update({'score': res.score})
42
+ if hasattr(res, 'content_audio'):
43
+ match.update({'content_audio': res.content_audio})
44
  query_results.append(
45
+ match
 
 
 
 
 
46
  )
47
  if not score_is_empty:
48
  query_results = sorted(
interface/components.py CHANGED
@@ -59,8 +59,10 @@ def component_show_search_result(container, results):
59
  st.markdown(f"### Match {idx+1}")
60
  st.markdown(f"**Text**: {document['text']}")
61
  st.markdown(f"**Document**: {document['id']}")
62
- if document["score"] is not None:
63
  st.markdown(f"**Score**: {document['score']:.3f}")
 
 
64
  st.markdown("---")
65
 
66
 
 
59
  st.markdown(f"### Match {idx+1}")
60
  st.markdown(f"**Text**: {document['text']}")
61
  st.markdown(f"**Document**: {document['id']}")
62
+ if 'score' in document:
63
  st.markdown(f"**Score**: {document['score']:.3f}")
64
+ if 'content_audio' in document:
65
+ st.audio(str(document['content_audio']))
66
  st.markdown("---")
67
 
68
 
requirements.txt CHANGED
@@ -5,4 +5,8 @@ black==22.8.0
5
  plotly==5.10.0
6
  newspaper3k==0.2.8
7
  PyPDF2==2.10.7
8
- pytesseract==0.3.10
 
 
 
 
 
5
  plotly==5.10.0
6
  newspaper3k==0.2.8
7
  PyPDF2==2.10.7
8
+ pytesseract==0.3.10
9
+ soundfile==0.10.3.post1
10
+ espnet
11
+ pydub==0.25.1
12
+ espnet_model_zoo==0.1.7