import os

import torch
import librosa
import gradio as gr
from glob import glob
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TextClassificationPipeline,
    AutoModelForTokenClassification,
    TokenClassificationPipeline,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)
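
# All four models below are downloaded from the Hugging Face Hub on first use
# and run on CPU by default.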

# ASR: wav2vec2 CTC model fine-tuned for English speech recognition
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
processor_asr = Wav2Vec2Processor.from_pretrained(model_name)
model_asr = Wav2Vec2ForCTC.from_pretrained(model_name)

# Intent classifier
model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
tokenizer_intent = AutoTokenizer.from_pretrained(model_name)
model_intent = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_intent = TextClassificationPipeline(model=model_intent, tokenizer=tokenizer_intent)

# Language identification classifier (51 languages)
model_name = 'qanastek/51-languages-classifier'
tokenizer_langs = AutoTokenizer.from_pretrained(model_name)
model_langs = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_language = TextClassificationPipeline(model=model_langs, tokenizer=tokenizer_langs)

# NER / slot extractor
model_name = 'qanastek/XLMRoberta-Alexa-Intents-NER-NLU'
tokenizer_ner = AutoTokenizer.from_pretrained(model_name)
model_ner = AutoModelForTokenClassification.from_pretrained(model_name)
predict_ner = TokenClassificationPipeline(model=model_ner, tokenizer=tokenizer_ner)

# Gradio examples: every .wav file sitting next to this script.
EXAMPLE_DIR = './'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))

def transcribe(audio_path):
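    """Transcribe one audio file with the wav2vec2 CTC model.

    librosa resamples the signal to the 16 kHz rate the model expects, and
    decoding is a greedy argmax over the CTC logits (no language model).
    """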
        
    speech_array, sampling_rate = librosa.load(audio_path, sr=16_000)

    inputs = processor_asr(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model_asr(inputs.input_values, attention_mask=inputs.attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    
    return processor_asr.batch_decode(predicted_ids)[0]
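
# Illustrative call (exact casing depends on the model's vocabulary):
#   transcribe("order-a-pizza.wav")  ->  "order a pizza"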

def getUniform(text):
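    """Group the NER pipeline's BIO-tagged sub-word tokens into
    (entity_label, word_pieces) pairs.

    Each pipeline item looks like {"entity": "B-food_type", "word": "▁pizza", ...}
    (label names here are illustrative): a "B-" tag opens a new entity and the
    "I-" tags that follow extend it; "▁" is the tokenizer's word-boundary mark.
    """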

    idx = 0
    res = {}

    for t in text:

        raw = t["entity"].replace("B-", "").replace("I-", "")
        word = t["word"].replace("▁", "")

        if t["entity"].startswith("B-"):
            # A "B-" tag opens a new entity group. Incrementing idx *before*
            # creating the key lets the following "I-" tokens find it.
            idx += 1
            res[f"{raw}|{idx}"] = [word]
        else:
            # An "I-" tag extends the group opened by the last "B-" tag;
            # setdefault guards against a stray "I-" with no preceding "B-".
            res.setdefault(f"{raw}|{idx}", []).append(word)

    return [(key.split("|")[0], words) for key, words in res.items()]


def process(path):
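    """Full NLU pipeline for one wav file: transcribe it, then classify the
    intent and language of the transcript and extract its slots/entities."""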
    
    text = transcribe(path)

    intent_class = classifier_intent(text)[0]["label"]
    language_class = classifier_language(text)[0]["label"]
    named_entities = getUniform(predict_ner(text))

    return {
        "text": text,
        "language": language_class,
        "intent_class": intent_class,
        "named_entities": named_entities,
    }
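
# Illustrative shape of the result for "set the volume to low"
# (the exact labels depend on the classifiers):
#   {"text": "set the volume to low", "language": "en-US",
#    "intent_class": "audio_volume_down", "named_entities": [...]}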

# Leftover absolute paths from local development (unused; the Gradio examples
# come from the glob above).
audio_paths = [
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/set-the-volume-to-low.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/tell-me-a-joke.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/tell me the artist of this song.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/order-a-pizza.wav",

    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/TTS_1/tell-me-a-good-joke.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/TTS_1/order me a pizza.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/TTS_1/tell-me-the-artist-of-this-song.wav",
]

def predict(wav_file):
    # Gradio callback: receives the recorded or uploaded audio as a file path.
    return process(wav_file)

# iface = gr.Interface(fn=predict, inputs="text", outputs="text")

iface = gr.Interface(
    fn=predict,
    title='Alexa NLU Clone',
    description='Upload your wav file to test the model',
    inputs=[
        # gr.inputs.Audio was removed in Gradio 4; the bare component is the
        # modern equivalent, and allowing upload matches the description above.
        gr.Audio(label='wav file', sources=['microphone', 'upload'], type='filepath'),
    ],
    outputs=[
        gr.JSON(label='Slot Recognition + Intent Classification + Language Classification + ASR'),
    ],
    examples=examples,
    article='Made with ❤️ by Yanis Labrak thanks to 🤗',
)

iface.launch()