kresnik committed on
Commit
10b9192
1 Parent(s): 2d16c99

add application file

Browse files
Files changed (3) hide show
  1. app.py +116 -0
  2. packages.txt +2 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sys
3
+ import gradio as gr
4
+ from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
5
+
6
# Send every log record to stdout with a timestamped format.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


# Models used when run() is called with model_size == "300M", keyed by the
# language name shown in the UI. "has_lm" says whether LM decoding is allowed.
LARGE_MODEL_BY_LANGUAGE = {
    "Korean": {"model_id": "kresnik/wav2vec2-large-xlsr-korean", "has_lm": True},
}

# run() looks languages up here for every other model size. Nothing is
# registered, but the name must exist: without it that branch raises NameError
# instead of reporting "model not found" to the user.
XLARGE_MODEL_BY_LANGUAGE = {}

LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE)

# the container given by HF has 16GB of RAM, so we need to limit the number of models to load
# Loaded models are kept here, keyed by model id, and reused across requests.
CACHED_MODELS_BY_ID = {}
25
+
26
+
27
def run(input_file, language, decoding_type, history, model_size="300M"):
    """Transcribe an audio file and render all results so far as HTML.

    Args:
        input_file: Audio input handed to the ASR pipeline (the UI passes a
            file path).
        language: Key used to look the model up in the language registry.
        decoding_type: "LM" selects the processor with a language-model
            decoder; any other value uses the plain processor without one.
        history: List of earlier result/error dicts, or a falsy value on the
            first call.
        model_size: "300M" uses LARGE_MODEL_BY_LANGUAGE; any other value uses
            XLARGE_MODEL_BY_LANGUAGE when that registry exists.

    Returns:
        A ``(html_output, history)`` tuple: the HTML fragment for every entry
        in ``history`` and the updated history list itself.
    """
    logger.info(
        "Running ASR %s-%s-%s for %s", language, model_size, decoding_type, input_file
    )

    history = history or []

    if model_size == "300M":
        registry = LARGE_MODEL_BY_LANGUAGE
    else:
        # The extra-large registry may not be defined in this module; look it
        # up defensively so the user gets the "not found" message below
        # instead of a NameError.
        registry = globals().get("XLARGE_MODEL_BY_LANGUAGE", {})
    model = registry.get(language)

    if model is None:
        history.append({
            "error_message": f"Model size {model_size} not found for {language} language :("
        })
    elif decoding_type == "LM" and not model["has_lm"]:
        history.append({
            "error_message": f"LM not available for {language} language :("
        })
    else:
        model_id = model["model_id"]

        # Reuse an already loaded model; loading happens once per model id.
        model_instance = CACHED_MODELS_BY_ID.get(model_id)
        if model_instance is None:
            model_instance = AutoModelForCTC.from_pretrained(model_id)
            CACHED_MODELS_BY_ID[model_id] = model_instance

        # Only the LM processor carries a decoder; greedy decoding passes None.
        if decoding_type == "LM":
            processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
            decoder = processor.decoder
        else:
            processor = Wav2Vec2Processor.from_pretrained(model_id)
            decoder = None

        asr = pipeline(
            "automatic-speech-recognition",
            model=model_instance,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            decoder=decoder,
        )

        transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]

        logger.info("Transcription for %s: %s", input_file, transcription)

        history.append({
            "model_id": model_id,
            "language": language,
            "model_size": model_size,
            "decoding_type": decoding_type,
            "transcription": transcription,
            "error_message": None,
        })

    return _render_history(history), history


def _render_history(history):
    """Return the HTML fragment listing every result or error in ``history``."""
    from html import escape

    parts = ["<div class='result'>"]
    for item in history:
        error_message = item.get("error_message")
        if error_message is not None:
            # Error text embeds caller-supplied values, so escape it.
            parts.append(
                f"<div class='result_item result_item_error'>{escape(error_message)}</div>"
            )
            continue

        model_id = escape(item["model_id"])
        url_suffix = " + LM" if item["decoding_type"] == "LM" else ""
        parts.append("<div class='result_item result_item_success'>")
        parts.append(
            f'<strong><a target="_blank" href="https://huggingface.co/{model_id}">'
            f"{model_id}{url_suffix}</a></strong><br/><br/>"
        )
        # The transcription is model output; escape it before embedding.
        parts.append(f'{escape(item["transcription"])}<br/>')
        parts.append("</div>")
    parts.append("</div>")

    return "".join(parts)
89
+
90
+
91
# Build the web UI around run(). The four inputs map positionally onto run's
# input_file, language, decoding_type and history parameters; the model-size
# selector is disabled, so run() always uses its default model_size.
demo = gr.Interface(
    fn=run,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
        gr.inputs.Radio(label="Language", choices=LANGUAGES),
        gr.inputs.Radio(label="Decoding type", choices=["greedy"]),
        # gr.inputs.Radio(label="Model size", choices=["300M", "1B"]),
        "state",
    ],
    # run() returns (html, history): the HTML is displayed and the history is
    # fed back in through the "state" input on the next call.
    outputs=[
        gr.outputs.HTML(label="Outputs"),
        "state",
    ],
    title="Automatic Speech Recognition",
    description="",
    css="""
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
""",
    allow_screenshot=False,
    allow_flagging="never",
    theme="grass",
)

demo.launch(enable_queue=True)
116
+
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
1
+ ffmpeg
2
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ pyctcdecode
4
+ pypi-kenlm
5
+