vumichien committed
Commit 08f9ba3
1 Parent(s): f481a94
app.py CHANGED
@@ -2,8 +2,9 @@ import gradio as gr
 import librosa
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torch
+
 # config
-model_name = "vumichien/wav2vec2-large-xlsr-japanese-hỉragana"
+model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
 processor = Wav2Vec2Processor.from_pretrained(model_name)
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
 
@@ -11,44 +12,37 @@ model = Wav2Vec2ForCTC.from_pretrained(model_name)
 def process_audio_file(file):
     data, sr = librosa.load(file)
     if sr != 16000:
-        data = librosa.resample(data, sr, 16000).squeeze()
+        data = librosa.resample(data, sr, 16000)
     print(data.shape)
     inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
     return inputs
 
 
-def transcribe(file_mic, file_upload):
-    warn_output = ""
-    if (file_mic is not None) and (file_upload is not None):
-        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the " \
-                      "microphone will be used and the uploaded audio will be discarded.\n "
-        file = file_mic
-    elif (file_mic is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-    elif file_mic is not None:
-        file = file_mic
-    else:
-        file = file_upload
+def transcribe(file):
     inputs = process_audio_file(file)
     with torch.no_grad():
-        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask.to("cuda")).logits
+        output_logit = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
     pred_ids = torch.argmax(output_logit, dim=-1)
-    return warn_output + processor.batch_decode(pred_ids)
+    return processor.batch_decode(pred_ids)[0]
 
 
+description = "A simple interface to transcribe from spoken Japanese to Hiragana."
+article = "<p style='text-align: center'><a @2022 Detomo </a></p>"
+inputs = [gr.inputs.Audio(source="microphone", type='filepath', optional=True)
+          ]
+examples = [["samples/BASIC5000_0001.wav"],
+            ["samples/BASIC5000_0005.wav"]
+           ]
 iface = gr.Interface(
     fn=transcribe,
-    inputs=[
-        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
-        gr.inputs.Audio(source="upload", type='filepath', optional=True),
-    ],
+    inputs=inputs,
     outputs="text",
     layout="horizontal",
     theme="huggingface",
     title="Transcribe Japanese audio to Hiragana",
-    description="A simple interface to transcribe from spoken Japanese to Hiragana.",
-    article="<p style='text-align: center'><a href='https://huggingface.co/facebook/wav2vec2-xls-r-1b-en-to-15' target='_blank'>Click to learn more about XLS-R-1B-EN-15 </a> | <a href='https://arxiv.org/abs/2111.09296' target='_blank'> With 🎙️ from Facebook XLS-R </a></p>",
-    enable_queue=True,
-    allow_flagging=False,
+    description=description,
+    article=article,
+    allow_flagging='never',
+    examples=examples
 )
-iface.launch()
+iface.launch(enable_queue=True, share=True)
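
This revision narrows transcribe() to a single audio input, removes the .to("cuda") call that moved only the attention mask to GPU while the model and input values stayed on CPU, and returns just the first decoded string. A standalone sketch of the same inference path, useful for smoke-testing outside Gradio (it assumes the pinned dependencies below are installed and the bundled sample clip is available; transcribe_file is an illustrative name, not part of the commit):

# Sketch of the inference path introduced in this commit, without the Gradio wrapper.
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

def transcribe_file(path):
    # librosa.load decodes to mono float32 at 22.05 kHz by default, so resampling is usually needed
    data, sr = librosa.load(path)
    if sr != 16000:
        data = librosa.resample(data, sr, 16000)
    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
    return processor.batch_decode(pred_ids)[0]

print(transcribe_file("samples/BASIC5000_0001.wav"))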
requirements.txt ADDED
@@ -0,0 +1,4 @@
+gradio~=2.7.5.2
+librosa~=0.8.1
+torch~=1.10.1
+transformers~=4.15.0
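
The librosa~=0.8.1 pin matters for the positional librosa.resample(data, sr, 16000) call in app.py: newer librosa releases expect the sample rates as keyword arguments. If the pin is ever relaxed, a keyword-based call should keep working across versions (a defensive sketch, not something this commit does; to_16k is an illustrative helper name):

import librosa

def to_16k(data, sr):
    # Keyword arguments are accepted by librosa 0.8.x and required by newer releases.
    if sr != 16000:
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
    return data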
samples/BASIC5000_0001.wav ADDED
Binary file (306 kB).
 
samples/BASIC5000_0005.wav ADDED
Binary file (354 kB).