File size: 2,415 Bytes
e5d93c0
 
 
ec5489a
e5d93c0
388e862
ec5489a
e5d93c0
ec5489a
 
 
e5d93c0
89b20be
 
 
e5d93c0
 
 
 
 
ec5489a
89b20be
 
ad145b5
e5d93c0
 
f080e4c
c6dd8c8
e5d93c0
 
c6dd8c8
8390518
e5d93c0
c6dd8c8
e5d93c0
 
c6dd8c8
e5d93c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f080e4c
e5d93c0
 
 
 
7c36a4c
a2d4ecd
5ee0b21
 
e5d93c0
fa0be23
e5d93c0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
import librosa
from transformers import AutoFeatureExtractor, AutoTokenizer, SpeechEncoderDecoderModel
import torch

model_name = "facebook/wav2vec2-xls-r-2b-22-to-16"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = SpeechEncoderDecoderModel.from_pretrained(model_name).to(device)

if torch.cuda.is_available():
    model.half()

def process_audio_file(file):
    data, sr = librosa.load(file)
    if sr != 16000:
        data = librosa.resample(data, sr, 16000)
    print(data.shape)
    input_values = feature_extractor(data, return_tensors="pt").input_values.to(device)
    
    if torch.cuda.is_available():
        input_values = input_values.to(torch.float16)
    return input_values
    
def transcribe(file_mic, target_language):
    
    target_code = target_language.split("(")[-1].split(")")[0]
    forced_bos_token_id = MAPPING[target_code]
       
    input_values = process_audio_file(file_mic)
    
    sequences = model.generate(input_values, forced_bos_token_id=forced_bos_token_id)
    
    transcription = tokenizer.batch_decode(sequences, skip_special_tokens=True)
    return warn_output + transcription[0]
    
target_language = [
    "English (en)",
    "German (de)",
    "Turkish (tr)",
    "Persian (fa)",
    "Swedish (sv)",
    "Mongolian (mn)",
    "Chinese (zh)",
    "Welsh (cy)",
    "Catalan (ca)",
    "Slovenian (sl)",
    "Estonian (et)",
    "Indonesian (id)",
    "Arabic (ar)",
    "Tamil (ta)",
    "Latvian (lv)",
    "Japanese (ja)",
]

MAPPING = {
    "en": 250004,
    "de": 250003,
    "tr": 250023,
    "fa": 250029,
    "sv": 250042,
    "mn": 250037,
    "zh": 250025,
    "cy": 250007,
    "ca": 250005,
    "sl": 250052,
    "et": 250006,
    "id": 250032,
    "ar": 250001,
    "ta": 250044,
    "lv": 250017,
    "ja": 250012,
}
    
iface = gr.Interface(
    fn=transcribe, 
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath'),
        gr.inputs.Dropdown(target_language),
    ],
    outputs="text",
    layout="horizontal",
    theme="default",
    description="A simple interface to translate from 22 input spoken languages to 16 written languages built by Meta/Facebook AI.",
    enable_queue=True,
    allow_flagging=False,
)

iface.launch()