barto17 committed on
Commit
4f3f83c
1 Parent(s): 25a955c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn.functional as F

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers.models.whisper.tokenization_whisper import LANGUAGES
from transformers.pipelines.audio_utils import ffmpeg_read

import gradio as gr
9
+
10
+
11
# Run on the GPU when one is available. torch device-type strings are
# lowercase, so the fallback has to be "cpu" ("CPU" is rejected by torch).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sequence-classification checkpoint and matching tokenizer used by
# detect_language() to identify the language of a piece of text.
model_ckpt = "ivanlau/language-detection-fine-tuned-on-xlm-roberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
16
+
17
def detect_language(sentence):
    """Classify the language of ``sentence`` with the module-level model.

    Args:
        sentence: Text to classify.

    Returns:
        A ``(language, probability)`` tuple: the ``LANGUANGE_MAP`` entry for
        the highest-scoring class, and that class's softmax probability as a
        Python float.
    """
    # NOTE(review): LANGUANGE_MAP is not defined in the visible part of this
    # file; it must map class indices to language names -- confirm where it
    # is supposed to come from.
    # Truncate so over-long text cannot exceed the model's maximum input
    # length, and keep the inputs on whatever device the model lives on.
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True)
    encoded = encoded.to(model.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded)

    probabilities = torch.nn.functional.softmax(output.logits, dim=-1)
    probability, pred_idx = torch.max(probabilities, dim=-1)
    language = LANGUANGE_MAP[pred_idx.item()]
    return language, probability.item()
24
+
25
+
26
def process_audio_file(file, sampling_rate=16000):
    """Read an audio file from disk and decode it with ``ffmpeg_read``.

    Args:
        file: Path of the audio file to open.
        sampling_rate: Sampling rate in Hz handed to ``ffmpeg_read``.
            Defaults to 16000 -- assumed to match the rate the downstream
            feature extractor expects; confirm against the processor in use.

    Returns:
        The value returned by ``ffmpeg_read`` for the file's raw bytes.
    """
    with open(file, "rb") as f:
        payload = f.read()

    return ffmpeg_read(payload, sampling_rate)
32
+
33
def transcribe(Microphone, File_Upload):
    """Identify the language of a recorded or uploaded audio clip.

    The microphone recording takes precedence when both inputs are given.

    Args:
        Microphone: File path of the microphone recording, or None.
        File_Upload: File path of the uploaded audio file, or None.

    Returns:
        A ``(language, probability)`` pair as produced by
        ``detect_language``. When both inputs were supplied the language
        text is prefixed with a warning. When neither was supplied, an error
        message and a probability of 0.0 are returned so the result still
        has two values.
    """
    if Microphone is None and File_Upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file", 0.0

    warn_output = ""
    if Microphone is not None and File_Upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    file = Microphone if Microphone is not None else File_Upload
    audio_data = process_audio_file(file)

    # NOTE(review): `processor` and `decoder_input_ids` are not defined in
    # the visible part of this file, and `model` is loaded there as a
    # sequence-classification model -- confirm the speech model setup this
    # function is meant to run against.
    input_features = processor(audio_data, return_tensors="pt").input_features

    with torch.no_grad():
        # Call the module itself rather than .forward() so hooks still run.
        logits = model(input_features.to(device), decoder_input_ids=decoder_input_ids).logits

    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(pred_ids[0])

    language, probability = detect_language(transcription.capitalize())
    return f"{warn_output}{language}", probability
59
+
60
+
61
# Audio clips offered as ready-made examples in the demo.
# NOTE(review): the interface has two audio inputs; confirm that a flat list
# of file names is accepted as `examples` by the gradio version in use.
examples = ['sample1.mp3', 'sample2.mp3', 'sample3.mp3']

# Output components, in the order of the (language, probability) pair that
# transcribe() returns.
outputs = [
    gr.outputs.Textbox(label="Language"),
    gr.Number(label="Probability"),
]

# Descriptive text passed to the interface as `article`.
article = """
Fine-tuned on xlm-roberta-base model.\n
Supported languages:\n
'Arabic', 'Basque', 'Breton', 'Catalan', 'Chinese_China', 'Chinese_Hongkong', 'Chinese_Taiwan', 'Chuvash', 'Czech',
'Dhivehi', 'Dutch', 'English', 'Esperanto', 'Estonian', 'French', 'Frisian', 'Georgian', 'German', 'Greek', 'Hakha_Chin',
'Indonesian', 'Interlingua', 'Italian', 'Japanese', 'Kabyle', 'Kinyarwanda', 'Kyrgyz', 'Latvian', 'Maltese',
'Mangolian', 'Persian', 'Polish', 'Portuguese', 'Romanian', 'Romansh_Sursilvan', 'Russian', 'Sakha', 'Slovenian',
'Spanish', 'Swedish', 'Tamil', 'Tatar', 'Turkish', 'Ukranian', 'Welsh'
"""

# Build and start the demo. transcribe() is the single handler: it takes the
# two audio inputs declared below (a keyword argument such as `fn` may only
# be passed once).
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True),
    ],
    outputs=outputs,
    verbose=True,
    examples=examples,
    title="Language Identification from Audio",
    description="Detect the Language from Audio.",
    article=article,
    theme="huggingface",
).launch()
94
+
95
+
96
+