MorenoLQ committed on
Commit
856bef6
•
1 Parent(s): e6cb6ec

First commit

Browse files
Files changed (5) hide show
  1. README.md +10 -6
  2. app.py +110 -0
  3. gradio_queue.db +0 -0
  4. packages.txt +1 -0
  5. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,16 @@
1
  ---
2
- title: Robust Asr It
3
- emoji: 🚀
4
- colorFrom: green
5
- colorTo: yellow
6
  sdk: gradio
7
  app_file: app.py
8
- pinned: false
9
  license: mit
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
 
 
 
 
1
  ---
2
+ title: Italian Robust ASR
3
+ emoji: 🎤
4
+ colorFrom: red
5
+ colorTo: green
6
  sdk: gradio
7
  app_file: app.py
8
+ pinned: true
9
  license: mit
10
  ---
11
 
12
+ # Italian Robust ASR
13
+
14
+ Demo app for testing the model trained during the robust-speech-challenge by 🤗 HuggingFace
15
+
16
+ Forked from [jonatasgrosman/asr](https://huggingface.co/spaces/jonatasgrosman/asr)
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import sys
3
+ import gradio as gr
4
+ from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
5
+
6
# Send log records to stdout with a timestamped format.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


# Models selectable in the UI, keyed by the short name shown to the user.
# "model_id" is the Hugging Face Hub repository; "has_lm" tells whether
# language-model-guided decoding may be requested for that model.
DICT_MODELS = {
    "robust-300m": {"model_id": "dbdmg/wav2vec2-xls-r-300m-italian-robust", "has_lm": True},
    "robust-1b": {"model_id": "dbdmg/wav2vec2-xls-r-1b-italian-robust", "has_lm": True},
    "300m": {"model_id": "dbdmg/wav2vec2-xls-r-300m-italian", "has_lm": True},
}


# The container given by HF has 16GB of RAM, so we need to limit the number
# of models to load.
MODELS = sorted(DICT_MODELS)

# Already-loaded acoustic models, keyed by "model_id".
CACHED_MODELS_BY_ID = {}
27
+
28
# Label of the decoding option that uses the language-model decoder.
_LM_DECODING = "Guided by Language Model"


def _render_history(history):
    """Render every entry of *history* as the HTML shown in the output panel."""
    import html  # local import: only needed for escaping here

    parts = ["<div class='result'>"]
    for item in history:
        if item["error_message"] is not None:
            message = html.escape(item["error_message"], quote=False)
            parts.append(f"<div class='result_item result_item_error'>{message}</div>")
            continue

        url_suffix = " + Guided by Language Model" if item["decoding_type"] == _LM_DECODING else ""
        # Escape the transcription so text such as "<unk>" is displayed
        # instead of being interpreted as markup.
        transcription = html.escape(item["transcription"], quote=False)
        parts.append("<div class='result_item result_item_success'>")
        parts.append(
            f'<strong><a target="_blank" href="https://huggingface.co/{item["model_id"]}">'
            f'{item["model_id"]}{url_suffix}</a></strong><br/><br/>'
        )
        parts.append(f"{transcription}<br/>")
        parts.append("</div>")
    parts.append("</div>")
    return "".join(parts)


def run(input_file, model_name, decoding_type, history):
    """Transcribe one recording and return the session's results as HTML.

    Args:
        input_file: Path of the audio file to transcribe, or ``None`` when
            nothing was recorded.
        model_name: Key into ``DICT_MODELS`` selecting the model.
        decoding_type: ``"Standard"`` or ``"Guided by Language Model"``.
        history: List of previous result dicts (the interface state), or a
            falsy value on the first call.

    Returns:
        A ``(html_output, history)`` tuple: the rendered HTML for all results
        so far and the updated history list.
    """
    logger.info("Running ASR %s-%s for %s", model_name, decoding_type, input_file)

    history = history or []

    model = DICT_MODELS.get(model_name)

    if model is None:
        # The original message formatted undefined names and raised NameError.
        history.append({"error_message": f"Model {model_name} not found :("})
    elif decoding_type == _LM_DECODING and not model["has_lm"]:
        history.append({"error_message": f"LM not available for model {model_name} :("})
    elif input_file is None:
        # Nothing was recorded; report it instead of failing inside the pipeline.
        history.append({"error_message": "No audio recorded, please record something first :("})
    else:
        model_id = model["model_id"]

        # Reuse an already-loaded acoustic model when there is one.
        model_instance = CACHED_MODELS_BY_ID.get(model_id)
        if model_instance is None:
            model_instance = AutoModelForCTC.from_pretrained(model_id)
            CACHED_MODELS_BY_ID[model_id] = model_instance

        if decoding_type == _LM_DECODING:
            processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id)
            decoder = processor.decoder
        else:
            processor = Wav2Vec2Processor.from_pretrained(model_id)
            decoder = None

        asr = pipeline(
            "automatic-speech-recognition",
            model=model_instance,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            decoder=decoder,
        )

        transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]

        logger.info("Transcription for %s: %s", input_file, transcription)

        history.append({
            "model_id": model_id,
            "decoding_type": decoding_type,
            "transcription": transcription,
            "error_message": None,
        })

    return _render_history(history), history
85
+
86
+
87
# Stylesheet for the result cards produced by `run`.
_RESULT_CSS = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""

# Input widgets, in the order of `run`'s parameters; "state" carries the
# history list between calls.
_INPUT_COMPONENTS = [
    gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
    gr.inputs.Radio(label="Model", choices=MODELS),
    gr.inputs.Radio(label="Decoding type", choices=["Standard", "Guided by Language Model"]),
    "state",
]

# Output widgets, matching `run`'s (html_output, history) return value.
_OUTPUT_COMPONENTS = [
    gr.outputs.HTML(label="Outputs"),
    "state",
]

demo = gr.Interface(
    fn=run,
    inputs=_INPUT_COMPONENTS,
    outputs=_OUTPUT_COMPONENTS,
    title="Italian Robust ASR",
    description="",
    css=_RESULT_CSS,
    allow_screenshot=False,
    allow_flagging="never",
    theme="huggingface",
)
demo.launch(enable_queue=True)
gradio_queue.db ADDED
File without changes
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ pyctcdecode
4
+ pypi-kenlm