Spaces:
Runtime error
Runtime error
first commit
Browse files- README.md +30 -6
- app.py +93 -0
- packages.txt +2 -0
- requirements.txt +7 -0
README.md
CHANGED
@@ -1,13 +1,37 @@
|
|
1 |
---
|
2 |
-
title: Voice
|
3 |
-
emoji:
|
4 |
colorFrom: green
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.0.12
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
-
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Voice Queries on Financial Documents
|
3 |
+
emoji: π
|
4 |
colorFrom: green
|
5 |
+
colorTo: yellow
|
6 |
sdk: gradio
|
|
|
7 |
app_file: app.py
|
8 |
pinned: false
|
|
|
9 |
---
|
10 |
|
11 |
+
# Configuration
|
12 |
+
|
13 |
+
`title`: _string_
|
14 |
+
Display title for the Space
|
15 |
+
|
16 |
+
`emoji`: _string_
|
17 |
+
Space emoji (emoji-only character allowed)
|
18 |
+
|
19 |
+
`colorFrom`: _string_
|
20 |
+
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
21 |
+
|
22 |
+
`colorTo`: _string_
|
23 |
+
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
24 |
+
|
25 |
+
`sdk`: _string_
|
26 |
+
Can be either `gradio` or `streamlit`
|
27 |
+
|
28 |
+
`sdk_version`: _string_
|
29 |
+
Only applicable for `streamlit` SDK.
|
30 |
+
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
|
31 |
+
|
32 |
+
`app_file`: _string_
|
33 |
+
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
|
34 |
+
Path is relative to the root of the repository.
|
35 |
+
|
36 |
+
`pinned`: _boolean_
|
37 |
+
Whether the Space stays on top of your list.
|
app.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pickle
import pandas as pd
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, Wav2Vec2ProcessorWithLM
from librosa import load, resample

# Constants

# Sentence-embedding model used for semantic search over the corpus.
model_name = 'sentence-transformers/msmarco-distilbert-base-v4'
# Maximum token length the encoder will accept per input text.
max_sequence_length = 512

# Load corpus
# Fetch the pre-computed corpus pickle from a public Google Drive file.
import subprocess
subprocess.run(["gdown", "1QVpyk_xyqNYrHT3NdUfBxbDV_eyCDa2Q"])
# SECURITY NOTE(review): pickle.load on a file downloaded from an external
# Drive link executes arbitrary code if that file is ever tampered with —
# consider a safer format (e.g. .npz for embeddings + JSON for sentences).
with open("embeddings.pkl", "rb") as fp:
    pickled_data = pickle.load(fp)
# The pickle holds the raw sentences and their embeddings; the two are
# index-aligned (hit indices from semantic search map back into `sentences`).
sentences = pickled_data['sentences']
corpus_embeddings = pickled_data['embeddings']

print(f'Number of documents: {len(sentences)}')


# Load pre-embedded corpus
# Sanity check: one embedding row per corpus sentence is expected.
print(f'Number of embeddings: {corpus_embeddings.shape[0]}')

# Load embedding model
model = SentenceTransformer(model_name)
model.max_seq_length = max_sequence_length

# Load speech to text model
# Wav2Vec2 checkpoint bundled with an n-gram LM for beam-search decoding.
asr_model = "patrickvonplaten/wav2vec2-base-960h-4-gram"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
asr = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)
def find_sentences(query, hits):
    """Return the corpus sentences most similar to *query*.

    Parameters
    ----------
    query : str
        Free-text search query; embedded with the module-level `model`.
    hits : int
        Number of nearest neighbours to retrieve.

    Returns
    -------
    pandas.DataFrame
        Columns ``Text`` (matched sentence) and ``Score`` (similarity),
        ordered best-first as produced by ``util.semantic_search``.
    """
    query_embedding = model.encode(query)
    # semantic_search returns one result list per query; we sent exactly one
    # query, so take element 0. (The original rebound the `hits` parameter
    # to this list, shadowing the requested hit count — renamed for clarity.)
    results = util.semantic_search(query_embedding, corpus_embeddings, top_k=hits)[0]
    # Map each hit's corpus index back to its source sentence.
    return pd.DataFrame(
        data={
            "Text": [sentences[hit['corpus_id']] for hit in results],
            "Score": [hit['score'] for hit in results],
        }
    )
def process(input_selection, query, filepath, hits):
    """Resolve the query text (typed or spoken) and run the semantic search.

    Parameters
    ----------
    input_selection : str
        Either ``'speech'`` (transcribe the recorded audio) or anything else
        (use the typed *query* as-is).
    query : str
        Typed text query; only used when *input_selection* is not 'speech'.
    filepath : str or None
        Path to the recorded audio file (the mic input is optional).
    hits : int
        Number of search results to return.

    Returns
    -------
    tuple
        ``(text, DataFrame)`` — the resolved query string and the results
        from ``find_sentences``.
    """
    if input_selection == 'speech':
        if filepath is None:
            # Mic input is optional in the UI: fail with a clear message
            # instead of crashing inside librosa's load(None).
            raise ValueError("Speech input selected but no audio was recorded")
        speech, sampling_rate = load(filepath)
        if sampling_rate != 16000:
            # Wav2Vec2 expects 16 kHz audio. Keyword arguments are required:
            # librosa >= 0.10 removed the positional orig_sr/target_sr form
            # used previously, which raises TypeError at runtime.
            speech = resample(speech, orig_sr=sampling_rate, target_sr=16000)
        text = asr(speech)['text']
    else:
        text = query
    return text, find_sentences(text, hits)
# Gradio inputs
# Use the top-level component classes: the gr.inputs / gr.outputs namespaces
# were deprecated in Gradio 3 and removed in Gradio 4, and this Space no
# longer pins an SDK version, so the legacy API raises AttributeError.
buttons = gr.Radio(['text', 'speech'], value='speech', label='Input selection')
text_query = gr.Textbox(lines=1, label='Text input', value='breast cancer biomarkers')
# The microphone input is optional; `process` handles a missing recording.
mic = gr.Audio(sources=['microphone'], type='filepath', label='Speech input')
slider = gr.Slider(minimum=1, maximum=10, step=1, value=3, label='Number of hits')

# Gradio outputs
speech_query = gr.Textbox(label='Query string')
results = gr.Dataframe(
    headers=['Text', 'Score'],
    label='Query results')

# NOTE(review): the legacy theme='huggingface' and layout='horizontal'
# Interface arguments no longer exist and are dropped (cosmetic only).
iface = gr.Interface(
    description='This Space lets you query a text corpus containing 50,000 random clinical trial descriptions',
    fn=process,
    inputs=[buttons, text_query, mic, slider],
    outputs=[speech_query, results],
    examples=[
        ['text', "breast cancer biomarkers", 'dummy.wav', 3],
    ],
    allow_flagging='never',
)
iface.launch()
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
libsndfile1
|
2 |
+
ffmpeg
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
pandas
|
4 |
+
numpy
|
5 |
+
sentence-transformers
|
6 |
+
librosa
|
7 |
+
gdown
|