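# Gradio demo: transcribe Spanish speech with a wav2vec2 ASR model and translate
# the transcription to Quechua with a T5 model fine-tuned for Spanish-to-Quechua.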
import gradio as gr
import librosa 
from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline


def load_and_fix_data(input_file, model_sampling_rate):
    # Load at the file's native sampling rate; we resample to the model's rate below.
    speech, sample_rate = librosa.load(input_file, sr=None, mono=False)
    # librosa returns multi-channel audio as (channels, samples); down-mix to mono.
    if speech.ndim > 1:
        speech = speech.mean(axis=0)
    if sample_rate != model_sampling_rate:
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=model_sampling_rate)
    return speech


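# Spanish ASR setup: the feature extractor exposes the sampling rate the model
# expects, and the pipeline performs the actual speech recognition.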
feature_extractor = AutoFeatureExtractor.from_pretrained("jonatasgrosman/wav2vec2-xls-r-1b-spanish")
sampling_rate = feature_extractor.sampling_rate

asr = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-xls-r-1b-spanish")



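# Spanish-to-Quechua translation model (a T5-small checkpoint fine-tuned for this task).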
model_name = 'hackathon-pln-es/t5-small-finetuned-spanish-to-quechua'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

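# f-string expressions cannot contain backslashes (before Python 3.12), so keep the newline in a variable.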
new_line = '\n'


def predict_and_ctc_lm_decode(input_file):
    # Transcribe the Spanish audio in 10-second chunks with a 1-second stride.
    speech = load_and_fix_data(input_file, sampling_rate)
    transcribed_text = asr(speech, chunk_length_s=10, stride_length_s=1)["text"]
    # Translate the transcription to Quechua with beam search.
    inputs = tokenizer(transcribed_text, return_tensors="pt")
    output = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)
    translation = tokenizer.decode(output[0], skip_special_tokens=True)
    return f"Spanish Audio Transcription: {transcribed_text}{new_line}Quechua Translation: {translation}"

description = """This is a Gradio demo that transcribes Spanish audio and translates the transcription to Quechua. To use it, provide an audio input (record via the microphone or pick one of the examples below); the speech is first transcribed and the transcription is then translated into Quechua.

Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-xls-r-1b-spanish](https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-spanish)

Pre-trained model used for translating the Spanish transcription to Quechua: [t5-small-finetuned-spanish-to-quechua](https://huggingface.co/hackathon-pln-es/t5-small-finetuned-spanish-to-quechua)

"""

gr.Interface(
    predict_and_ctc_lm_decode,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")
    ],
    outputs=[gr.outputs.Textbox()],
    examples=[["sunny_day.wav"], ["travel.wav"], ["sample_audio.wav"]],
    title="Spanish-Audio-Transcriptions-to-Quechua-Translation",
    description=description,
    #article="<p><center><img src='........e'></center></p>",
    cache_examples=True,
    layout="horizontal",
    theme="huggingface",
).launch(enable_queue=True)