Spaces:
Runtime error
Runtime error
update
Browse files
app.py
CHANGED
@@ -1,104 +1,28 @@
|
|
1 |
-
import logging
|
2 |
-
import sys
|
3 |
import gradio as gr
|
4 |
from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
|
5 |
|
6 |
-
logging.basicConfig(
|
7 |
-
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
8 |
-
datefmt="%m/%d/%Y %H:%M:%S",
|
9 |
-
handlers=[logging.StreamHandler(sys.stdout)],
|
10 |
-
)
|
11 |
-
logger = logging.getLogger(__name__)
|
12 |
-
logger.setLevel(logging.DEBUG)
|
13 |
|
14 |
|
15 |
LARGE_MODEL_BY_LANGUAGE = {
|
16 |
"Korean": {"model_id": "kresnik/wav2vec2-large-xlsr-korean", "has_lm": True},
|
17 |
}
|
18 |
|
|
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
def run(input_file, language, decoding_type, history, model_size="300M"):
|
28 |
-
|
29 |
-
logger.info(f"Running ASR {language}-{model_size}-{decoding_type} for {input_file}")
|
30 |
-
|
31 |
-
history = history or []
|
32 |
-
|
33 |
-
if model_size == "300M":
|
34 |
-
model = LARGE_MODEL_BY_LANGUAGE.get(language, None)
|
35 |
-
else:
|
36 |
-
model = XLARGE_MODEL_BY_LANGUAGE.get(language, None)
|
37 |
-
|
38 |
-
if model is None:
|
39 |
-
history.append({
|
40 |
-
"error_message": f"Model size {model_size} not found for {language} language :("
|
41 |
-
})
|
42 |
-
elif decoding_type == "LM" and not model["has_lm"]:
|
43 |
-
history.append({
|
44 |
-
"error_message": f"LM not available for {language} language :("
|
45 |
-
})
|
46 |
-
else:
|
47 |
-
|
48 |
-
# model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
|
49 |
-
model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
|
50 |
-
if model_instance is None:
|
51 |
-
model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
|
52 |
-
CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
|
53 |
-
|
54 |
-
if decoding_type == "LM":
|
55 |
-
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model["model_id"])
|
56 |
-
asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
|
57 |
-
feature_extractor=processor.feature_extractor, decoder=processor.decoder)
|
58 |
-
else:
|
59 |
-
processor = Wav2Vec2Processor.from_pretrained(model["model_id"])
|
60 |
-
asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
|
61 |
-
feature_extractor=processor.feature_extractor, decoder=None)
|
62 |
-
|
63 |
-
transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]
|
64 |
-
|
65 |
-
logger.info(f"Transcription for {input_file}: {transcription}")
|
66 |
-
|
67 |
-
history.append({
|
68 |
-
"model_id": model["model_id"],
|
69 |
-
"language": language,
|
70 |
-
"model_size": model_size,
|
71 |
-
"decoding_type": decoding_type,
|
72 |
-
"transcription": transcription,
|
73 |
-
"error_message": None
|
74 |
-
})
|
75 |
-
|
76 |
-
html_output = "<div class='result'>"
|
77 |
-
for item in history:
|
78 |
-
if item["error_message"] is not None:
|
79 |
-
html_output += f"<div class='result_item result_item_error'>{item['error_message']}</div>"
|
80 |
-
else:
|
81 |
-
url_suffix = " + LM" if item["decoding_type"] == "LM" else ""
|
82 |
-
html_output += "<div class='result_item result_item_success'>"
|
83 |
-
html_output += f'<strong><a target="_blank" href="https://huggingface.co/{item["model_id"]}">{item["model_id"]}{url_suffix}</a></strong><br/><br/>'
|
84 |
-
html_output += f'{item["transcription"]}<br/>'
|
85 |
-
html_output += "</div>"
|
86 |
-
html_output += "</div>"
|
87 |
-
|
88 |
-
return html_output, history
|
89 |
-
|
90 |
|
91 |
gr.Interface(
|
92 |
-
|
93 |
inputs=[
|
94 |
gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
|
95 |
-
gr.inputs.Radio(label="Language", choices=LANGUAGES),
|
96 |
-
gr.inputs.Radio(label="Decoding type", choices=["greedy"]),
|
97 |
-
# gr.inputs.Radio(label="Model size", choices=["300M", "1B"]),
|
98 |
"state"
|
99 |
],
|
100 |
outputs=[
|
101 |
-
|
102 |
"state"
|
103 |
],
|
104 |
title="Automatic Speech Recognition",
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
|
6 |
LARGE_MODEL_BY_LANGUAGE = {
|
7 |
"Korean": {"model_id": "kresnik/wav2vec2-large-xlsr-korean", "has_lm": True},
|
8 |
}
|
9 |
|
10 |
+
# Build the speech-recognition pipeline once at import time so every request
# reuses the same loaded model. The first positional argument of `pipeline`
# is the task name, so the checkpoint id has to be passed as `model=`;
# passing it positionally made the call fail with an unknown-task error.
p = pipeline(
    "automatic-speech-recognition",
    model="kresnik/wav2vec2-large-xlsr-korean",
)
|
11 |
|
12 |
+
def transcribe(audio, state=""):
    """Transcribe one recording and append it to the running transcript.

    Args:
        audio: The recording handed over by the audio input (configured with
            ``type="filepath"``). ``None`` is tolerated and treated as
            "nothing to transcribe".
        state: Transcript accumulated by earlier calls. ``None`` or an empty
            string starts a fresh transcript.

    Returns:
        A ``(text, state)`` tuple containing the updated transcript twice:
        one copy for the "textbox" output and one for the "state" output.
    """
    # Function-scope import: the module does not import `time`, so the
    # original `time.sleep` call raised NameError on the first request.
    import time

    state = state or ""
    if audio is None:
        # No recording supplied; leave the transcript unchanged instead of
        # handing None to the pipeline.
        return state, state

    # Fixed 2-second pause kept from the original.
    # NOTE(review): presumably throttles successive requests — confirm.
    time.sleep(2)
    text = p(audio)["text"]
    # Trailing space separates this chunk from the next appended one.
    state += text + " "
    return state, state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
gr.Interface(
|
19 |
+
fn=transcribe,
|
20 |
inputs=[
|
21 |
gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
|
|
|
|
|
|
|
22 |
"state"
|
23 |
],
|
24 |
outputs=[
|
25 |
+
"textbox",
|
26 |
"state"
|
27 |
],
|
28 |
title="Automatic Speech Recognition",
|