patrickvonplaten committed on
Commit
e5d93c0
•
1 Parent(s): fbbd41b
Files changed (4) hide show
  1. README.md +4 -4
  2. app.py +81 -0
  3. packages.txt +2 -0
  4. requirements.txt +5 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: XLS R 2B 22 16
3
- emoji: 💻
4
- colorFrom: green
5
- colorTo: purple
6
  sdk: gradio
7
  app_file: app.py
8
  pinned: false
 
1
  ---
2
+ title: XLS R 300m EN 15
3
+ emoji: 📊
4
+ colorFrom: gray
5
+ colorTo: red
6
  sdk: gradio
7
  app_file: app.py
8
  pinned: false
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
import librosa
from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel
4
+
5
# Hub checkpoint served by this app. This must be a plain string: a trailing
# comma after the literal would make it a 1-tuple, which is not a valid
# checkpoint identifier for from_pretrained().
model_name = "facebook/wav2vec2-xls-r-300m-22-to-16"

# Hub access token, read from the environment so that no credential lives in
# the repository. When the variable is unset this is None, which means
# anonymous access (sufficient for public checkpoints).
# NOTE(review): the token that used to be embedded here is in the commit
# history and should be revoked.
hf_auth_token = os.environ.get("HF_TOKEN")

# Loaded once at import time and shared by every request.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_name, use_auth_token=hf_auth_token
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name, use_auth_token=hf_auth_token, use_fast=False
)
model = SpeechEncoderDecoderModel.from_pretrained(
    model_name, use_auth_token=hf_auth_token
)
10
+
11
def process_audio_file(file):
    """Load an audio file and turn it into input values for the model.

    Args:
        file: Path of the audio file to read; passed straight to
            ``librosa.load``.

    Returns:
        The ``input_values`` produced by the module-level
        ``feature_extractor`` (requested as PyTorch tensors via
        ``return_tensors="pt"``).
    """
    sampling_rate = 16000

    # Ask librosa for 16 kHz directly. The default load rate is 22050 Hz, so
    # loading first and resampling afterwards would resample the signal twice,
    # and the positional form of librosa.resample() is rejected by newer
    # librosa releases.
    data, _ = librosa.load(file, sr=sampling_rate)

    # Passing the rate lets the feature extractor verify it matches what the
    # checkpoint expects instead of silently accepting mismatched audio.
    features = feature_extractor(
        data, sampling_rate=sampling_rate, return_tensors="pt"
    )
    return features.input_values
18
+
19
def transcribe(file, target_language):
    """Translate recorded speech into text in the selected target language.

    Args:
        file: Path of the recorded audio file, or ``None`` when nothing was
            recorded.
        target_language: Dropdown label of the form ``"Name (code)"``; the
            text between the last ``(`` and the following ``)`` is used as the
            key into ``MAPPING``.

    Returns:
        The decoded text of the first generated sequence.

    Raises:
        ValueError: If ``file`` is ``None``.
        KeyError: If the code extracted from ``target_language`` is not a key
            of ``MAPPING``.
    """
    # Fail early with a readable message instead of an obscure error from the
    # audio loader when the user submits without recording anything.
    if file is None:
        raise ValueError("No audio was provided; please record something first.")

    # "German (de)" -> "de"
    target_code = target_language.split("(")[-1].split(")")[0]
    forced_bos_token_id = MAPPING[target_code]

    input_values = process_audio_file(file)

    # The forced first token selects the output language.
    sequences = model.generate(input_values, forced_bos_token_id=forced_bos_token_id)

    transcription = tokenizer.batch_decode(sequences, skip_special_tokens=True)
    return transcription[0]
30
+
31
# Choices offered in the target-language dropdown. Each label must keep the
# "Name (code)" shape: transcribe() extracts the code between the parentheses
# and looks it up in MAPPING.
target_language = [
    "English (en)",
    "German (de)",
    "Turkish (tr)",
    "Persian (fa)",
    "Swedish (sv)",
    "Mongolian (mn)",
    "Chinese (zh)",
    "Welsh (cy)",
    "Catalan (ca)",
    "Slovenian (sl)",
    "Estonian (et)",
    "Indonesian (id)",
    "Arabic (ar)",
    "Tamil (ta)",
    "Latvian (lv)",
    "Japanese (ja)",
]
49
+
50
# Target-language code -> token id that transcribe() passes to
# model.generate() as forced_bos_token_id. The keys must cover every code
# that appears in the dropdown labels.
MAPPING = {
    "en": 250004,
    "de": 250003,
    "tr": 250023,
    "fa": 250029,
    "sv": 250042,
    "mn": 250037,
    "zh": 250025,
    "cy": 250007,
    "ca": 250005,
    "sl": 250052,
    "et": 250006,
    "id": 250032,
    "ar": 250001,
    "ta": 250044,
    "lv": 250017,
    "ja": 250012,
}
68
+
69
# Web UI: a microphone recording plus a target-language dropdown go in, the
# translated text comes out.
# NOTE(review): gr.inputs.*, layout= and theme="huggingface" look like the
# legacy Gradio API; confirm against the Gradio version the Space runs on
# before upgrading.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        # type="filepath" makes Gradio hand transcribe() a path on disk.
        gr.inputs.Audio(source="microphone", type="filepath"),
        gr.inputs.Dropdown(target_language),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="XLS-R 300M 22-to-16 Speech Translation",
    description="A simple interface to translate from 22 input spoken languages to 16 written languages.",
)
iface.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libsndfile1
2
+ sox
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ SoundFile==0.9.0.post1
2
+ librosa
3
+ sentencepiece
4
+ torch
5
+ git+https://github.com/huggingface/transformers