qanastek committed on
Commit
73bf18c
•
1 Parent(s): 3b733d8

Add multi langs

Browse files
app.py CHANGED
@@ -6,10 +6,27 @@ import librosa
6
  from glob import glob
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoModelForTokenClassification, TokenClassificationPipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
8
 
9
- # ASR
10
- model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
11
- processor_asr = Wav2Vec2Processor.from_pretrained(model_name)
12
- model_asr = Wav2Vec2ForCTC.from_pretrained(model_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Classifier Intent
15
  model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
@@ -29,13 +46,23 @@ tokenizer_ner = AutoTokenizer.from_pretrained(model_name)
29
  model_ner = AutoModelForTokenClassification.from_pretrained(model_name)
30
  predict_ner = TokenClassificationPipeline(model=model_ner, tokenizer=tokenizer_ner)
31
 
32
- EXAMPLE_DIR = './'
33
  examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
 
34
 
35
- def transcribe(audio_path):
36
 
37
  speech_array, sampling_rate = librosa.load(audio_path, sr=16_000)
38
 
 
 
 
 
 
 
 
 
 
39
  inputs = processor_asr(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
40
 
41
  with torch.no_grad():
@@ -66,9 +93,9 @@ def getUniform(text):
66
  return res
67
 
68
 
69
- def process(path):
70
 
71
- text = transcribe(path).replace("apizza","a pizza")
72
 
73
  intent_class = classifier_intent(text)[0]["label"]
74
  language_class = classifier_language(text)[0]["label"]
@@ -81,18 +108,13 @@ def process(path):
81
  "named_entities": named_entities,
82
  }
83
 
84
- def predict(wav_file):
85
- res = process(wav_file)
86
- return res
87
-
88
- # iface = gr.Interface(fn=predict, inputs="text", outputs="text")
89
-
90
  iface = gr.Interface(
91
  predict,
92
  title='Alexa NLU Clone',
93
  description='Upload your wav file to test the models',
94
  inputs=[
95
- gr.inputs.Audio(label='wav file', source='microphone', type='filepath')
 
96
  ],
97
  outputs=[
98
  gr.outputs.JSON(label='Slot Recognition + Intent Classification + Language Classification + ASR'),
 
6
  from glob import glob
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoModelForTokenClassification, TokenClassificationPipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
8
 
9
+ SAMPLE_RATE = 16_000
10
+
11
+ models = {}
12
+
13
+ models_names = {
14
+ "en-US": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
15
+ "fr-FR": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
16
+ "nl-NL": "jonatasgrosman/wav2vec2-large-xlsr-53-dutch",
17
+ "pl-PL": "jonatasgrosman/wav2vec2-large-xlsr-53-polish",
18
+ "it-IT": "jonatasgrosman/wav2vec2-large-xlsr-53-italian",
19
+ "ru-RU": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
20
+ "pt-PT": "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
21
+ "de-DE": "jonatasgrosman/wav2vec2-large-xlsr-53-german",
22
+ "es-ES": "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
23
+ "ja-JP": "jonatasgrosman/wav2vec2-large-xlsr-53-japanese",
24
+ "ar-SA": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
25
+ "fi-FI": "jonatasgrosman/wav2vec2-large-xlsr-53-finnish",
26
+ "hu-HU": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian",
27
+ "zh-CN": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
28
+ "el-GR": "jonatasgrosman/wav2vec2-large-xlsr-53-greek",
29
+ }
30
 
31
  # Classifier Intent
32
  model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
 
46
  model_ner = AutoModelForTokenClassification.from_pretrained(model_name)
47
  predict_ner = TokenClassificationPipeline(model=model_ner, tokenizer=tokenizer_ner)
48
 
49
+ EXAMPLE_DIR = './wavs/'
50
  examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
51
+ examples = [[e.split("=")[1], e.split("=")[0]] for e in examples]
52
 
53
+ def transcribe(audio_path, lang_code):
54
 
55
  speech_array, sampling_rate = librosa.load(audio_path, sr=16_000)
56
 
57
+ if lang_code not in models:
58
+ models[lang_code] = {}
59
+ models[lang_code]["processor"] = Wav2Vec2Processor.from_pretrained(models_names[lang_code])
60
+ models[lang_code]["model"] = Wav2Vec2ForCTC.from_pretrained(models_names[lang_code])
61
+
62
+ # Load model
63
+ processor_asr = models[lang_code]["processor"]
64
+ model_asr = models[lang_code]["model"]
65
+
66
  inputs = processor_asr(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
67
 
68
  with torch.no_grad():
 
93
  return res
94
 
95
 
96
+ def predict(wav_file, lang_code):
97
 
98
+ text = transcribe(wav_file, lang_code).replace("apizza","a pizza")
99
 
100
  intent_class = classifier_intent(text)[0]["label"]
101
  language_class = classifier_language(text)[0]["label"]
 
108
  "named_entities": named_entities,
109
  }
110
 
 
 
 
 
 
 
111
  iface = gr.Interface(
112
  predict,
113
  title='Alexa NLU Clone',
114
  description='Upload your wav file to test the models',
115
  inputs=[
116
+ gr.inputs.Audio(label='wav file', source='microphone', type='filepath'),
117
+ gr.Dropdown(list(models_names.keys())),
118
  ],
119
  outputs=[
120
  gr.outputs.JSON(label='Slot Recognition + Intent Classification + Language Classification + ASR'),
order-me-a-pizza.wav → wavs/en_US=order-me-a-pizza.wav RENAMED
File without changes
set-the-volume-to-low.wav → wavs/en_US=set-the-volume-to-low.wav RENAMED
File without changes
tell-me-a-good-joke.wav → wavs/en_US=tell-me-a-good-joke.wav RENAMED
File without changes
tell-me-the-artist-of-this-song.wav → wavs/en_US=tell-me-the-artist-of-this-song.wav RENAMED
File without changes
wavs/es_ES=poner-una-alarma-a-las-doce.wav ADDED
Binary file (70.6 kB). View file