kresnik committed on
Commit
14855d6
1 Parent(s): 1b1ef7b
Files changed (1) hide show
  1. app.py +8 -84
app.py CHANGED
@@ -1,104 +1,28 @@
1
- import logging
2
- import sys
3
  import gradio as gr
4
  from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
5
 
6
- logging.basicConfig(
7
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
8
- datefmt="%m/%d/%Y %H:%M:%S",
9
- handlers=[logging.StreamHandler(sys.stdout)],
10
- )
11
- logger = logging.getLogger(__name__)
12
- logger.setLevel(logging.DEBUG)
13
 
14
 
15
  LARGE_MODEL_BY_LANGUAGE = {
16
  "Korean": {"model_id": "kresnik/wav2vec2-large-xlsr-korean", "has_lm": True},
17
  }
18
 
 
19
 
20
- # LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE.keys())
21
-
22
- # the container given by HF has 16GB of RAM, so we need to limit the number of models to load
23
- LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE.keys())
24
- CACHED_MODELS_BY_ID = {}
25
-
26
-
27
- def run(input_file, language, decoding_type, history, model_size="300M"):
28
-
29
- logger.info(f"Running ASR {language}-{model_size}-{decoding_type} for {input_file}")
30
-
31
- history = history or []
32
-
33
- if model_size == "300M":
34
- model = LARGE_MODEL_BY_LANGUAGE.get(language, None)
35
- else:
36
- model = XLARGE_MODEL_BY_LANGUAGE.get(language, None)
37
-
38
- if model is None:
39
- history.append({
40
- "error_message": f"Model size {model_size} not found for {language} language :("
41
- })
42
- elif decoding_type == "LM" and not model["has_lm"]:
43
- history.append({
44
- "error_message": f"LM not available for {language} language :("
45
- })
46
- else:
47
-
48
- # model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
49
- model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
50
- if model_instance is None:
51
- model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
52
- CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
53
-
54
- if decoding_type == "LM":
55
- processor = Wav2Vec2ProcessorWithLM.from_pretrained(model["model_id"])
56
- asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
57
- feature_extractor=processor.feature_extractor, decoder=processor.decoder)
58
- else:
59
- processor = Wav2Vec2Processor.from_pretrained(model["model_id"])
60
- asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
61
- feature_extractor=processor.feature_extractor, decoder=None)
62
-
63
- transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]
64
-
65
- logger.info(f"Transcription for {input_file}: {transcription}")
66
-
67
- history.append({
68
- "model_id": model["model_id"],
69
- "language": language,
70
- "model_size": model_size,
71
- "decoding_type": decoding_type,
72
- "transcription": transcription,
73
- "error_message": None
74
- })
75
-
76
- html_output = "<div class='result'>"
77
- for item in history:
78
- if item["error_message"] is not None:
79
- html_output += f"<div class='result_item result_item_error'>{item['error_message']}</div>"
80
- else:
81
- url_suffix = " + LM" if item["decoding_type"] == "LM" else ""
82
- html_output += "<div class='result_item result_item_success'>"
83
- html_output += f'<strong><a target="_blank" href="https://huggingface.co/{item["model_id"]}">{item["model_id"]}{url_suffix}</a></strong><br/><br/>'
84
- html_output += f'{item["transcription"]}<br/>'
85
- html_output += "</div>"
86
- html_output += "</div>"
87
-
88
- return html_output, history
89
-
90
 
91
  gr.Interface(
92
- run,
93
  inputs=[
94
  gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
95
- gr.inputs.Radio(label="Language", choices=LANGUAGES),
96
- gr.inputs.Radio(label="Decoding type", choices=["greedy"]),
97
- # gr.inputs.Radio(label="Model size", choices=["300M", "1B"]),
98
  "state"
99
  ],
100
  outputs=[
101
- gr.outputs.HTML(label="Outputs"),
102
  "state"
103
  ],
104
  title="Automatic Speech Recognition",
 
 
 
1
  import gradio as gr
2
  from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
3
 
 
 
 
 
 
 
 
4
 
5
 
6
  LARGE_MODEL_BY_LANGUAGE = {
7
  "Korean": {"model_id": "kresnik/wav2vec2-large-xlsr-korean", "has_lm": True},
8
  }
9
 
10
# Build the speech-recognition pipeline once at import time so every request
# reuses the same loaded model. The first positional argument of `pipeline`
# is the task name, so the checkpoint has to be passed through `model=`;
# passing the model id positionally is rejected as an unknown task.
p = pipeline(
    "automatic-speech-recognition",
    model="kresnik/wav2vec2-large-xlsr-korean",
)
11
 
12
def transcribe(audio, state=""):
    """Transcribe one recording and append it to the running transcript.

    Args:
        audio: Input forwarded unchanged to the module-level pipeline ``p``.
            The interface declares the audio input with ``type="filepath"``,
            so this is presumably a path to the recorded file.
        state: Transcript accumulated by earlier calls; defaults to an empty
            string. ``None`` is treated as empty.

    Returns:
        A ``(transcript, transcript)`` tuple: the same updated string is
        returned twice, once for the textbox output and once for the state
        output declared on the interface.
    """
    # Imported locally: the module does not import `time` at the top, and the
    # original body raised NameError on the `time.sleep` call.
    import time

    # Tolerate an unset state so the string concatenation below cannot fail.
    state = state or ""

    # NOTE(review): Gradio appears to pass None when nothing was recorded;
    # return the transcript unchanged instead of handing None to the model.
    if audio is None:
        return state, state

    # Pause kept from the original implementation (presumably throttling
    # back-to-back requests - confirm whether it is still wanted).
    time.sleep(2)

    text = p(audio)["text"]
    state += text + " "
    return state, state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  gr.Interface(
19
+ fn=transcribe,
20
  inputs=[
21
  gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
 
 
 
22
  "state"
23
  ],
24
  outputs=[
25
+ "textbox",
26
  "state"
27
  ],
28
  title="Automatic Speech Recognition",