nbroad (HF staff) committed
Commit e693db5 • 1 Parent(s): abe434f

first commit

Files changed (4)
  1. README.md +30 -6
  2. app.py +93 -0
  3. packages.txt +2 -0
  4. requirements.txt +7 -0
README.md CHANGED
@@ -1,13 +1,37 @@
  ---
- title: Voice Querires Clinical Trials
- emoji: 💩
+ title: Voice Queries on Financial Documents
+ emoji: 🎙
  colorFrom: green
- colorTo: blue
+ colorTo: yellow
  sdk: gradio
- sdk_version: 3.0.12
  app_file: app.py
  pinned: false
- license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Configuration
+
+ `title`: _string_
+ Display title for the Space
+
+ `emoji`: _string_
+ Space emoji (emoji-only character allowed)
+
+ `colorFrom`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `colorTo`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `sdk`: _string_
+ Can be either `gradio` or `streamlit`
+
+ `sdk_version`: _string_
+ Only applicable for `streamlit` SDK.
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+ `app_file`: _string_
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+ Path is relative to the root of the repository.
+
+ `pinned`: _boolean_
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,93 @@
+ import pickle
+ import pandas as pd
+ import gradio as gr
+ import numpy as np
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import pipeline, Wav2Vec2ProcessorWithLM
+ from librosa import load, resample
+
+ # Constants
+
+ model_name = 'sentence-transformers/msmarco-distilbert-base-v4'
+ max_sequence_length = 512
+
+ # Load corpus
+ import subprocess
+ subprocess.run(["gdown", "1QVpyk_xyqNYrHT3NdUfBxbDV_eyCDa2Q"])
+ with open("embeddings.pkl", "rb") as fp:
+     pickled_data = pickle.load(fp)
+     sentences = pickled_data['sentences']
+     corpus_embeddings = pickled_data['embeddings']
+
+ print(f'Number of documents: {len(sentences)}')
+
+
+ # Load pre-embedded corpus
+ print(f'Number of embeddings: {corpus_embeddings.shape[0]}')
+
+ # Load embedding model
+ model = SentenceTransformer(model_name)
+ model.max_seq_length = max_sequence_length
+
+ # Load speech-to-text model
+ asr_model = "patrickvonplaten/wav2vec2-base-960h-4-gram"
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
+ asr = pipeline(
+     "automatic-speech-recognition",
+     model=asr_model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     decoder=processor.decoder,
+ )
+
+ def find_sentences(query, hits):
+     query_embedding = model.encode(query)
+     hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=hits)
+     hits = hits[0]
+
+     output_texts = []
+     output_scores = []
+
+     for hit in hits:
+         # Find source document based on sentence index
+         output_texts.append(sentences[hit['corpus_id']])
+         output_scores.append(hit['score'])
+
+     return pd.DataFrame(data={"Text": output_texts, "Score": output_scores})
+
+
+ def process(input_selection, query, filepath, hits):
+     if input_selection == 'speech':
+         speech, sampling_rate = load(filepath)
+         if sampling_rate != 16000:
+             speech = resample(speech, orig_sr=sampling_rate, target_sr=16000)
+         text = asr(speech)['text']
+     else:
+         text = query
+     return text, find_sentences(text, hits)
+
+ # Gradio inputs
+ buttons = gr.inputs.Radio(['text', 'speech'], type='value', default='speech', label='Input selection')
+ text_query = gr.inputs.Textbox(lines=1, label='Text input', default='breast cancer biomarkers')
+ mic = gr.inputs.Audio(source='microphone', type='filepath', label='Speech input', optional=True)
+ slider = gr.inputs.Slider(minimum=1, maximum=10, step=1, default=3, label='Number of hits')
+
+ # Gradio outputs
+ speech_query = gr.outputs.Textbox(type='auto', label='Query string')
+ results = gr.outputs.Dataframe(
+     headers=['Text', 'Score'],
+     label='Query results')
+
+ iface = gr.Interface(
+     theme='huggingface',
+     description='This Space lets you query a text corpus containing 50,000 random clinical trial descriptions',
+     fn=process,
+     layout='horizontal',
+     inputs=[buttons, text_query, mic, slider],
+     outputs=[speech_query, results],
+     examples=[
+         ['text', "breast cancer biomarkers", 'dummy.wav', 3],
+     ],
+     allow_flagging=False
+ )
+ iface.launch()
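
app.py downloads `embeddings.pkl` from Google Drive and expects a dict with `sentences` (the raw trial descriptions) and `embeddings` (their precomputed vectors). The script that produced that file is not part of this commit; the following is a minimal sketch of how such a pickle could be built with the same embedding model, assuming a hypothetical `corpus.txt` with one description per line.

```python
# Hypothetical corpus-preparation sketch (not included in this commit):
# builds embeddings.pkl in the format app.py expects.
import pickle

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v4")
model.max_seq_length = 512

# Assumption: corpus.txt holds one clinical trial description per line.
with open("corpus.txt") as fp:
    sentences = [line.strip() for line in fp if line.strip()]

# Encode the corpus once; util.semantic_search in app.py accepts either a
# numpy array or a torch tensor of corpus embeddings.
corpus_embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=True)

with open("embeddings.pkl", "wb") as fp:
    pickle.dump({"sentences": sentences, "embeddings": corpus_embeddings}, fp)
```

At query time, `find_sentences` encodes the typed or transcribed query with the same model and ranks it against these vectors via `util.semantic_search`, so the corpus and the queries must be embedded with the same SentenceTransformer checkpoint and maximum sequence length.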
packages.txt ADDED
@@ -0,0 +1,2 @@
+ libsndfile1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch
+ transformers
+ pandas
+ numpy
+ sentence-transformers
+ librosa
+ gdown