leofltt committed on
Commit
593ca04
1 Parent(s): 25c6a90

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -3
app.py CHANGED
@@ -22,7 +22,12 @@ bark_model.to(device)
22
 
23
 
24
  def translate(audio):
25
- inputs = asr_processor(audio, sampling_rate=16000, return_tensors="pt")
 
 
 
 
 
26
  generated_ids = asr_model.generate(inputs["input_features"],attention_mask=inputs["attention_mask"],
27
  forced_bos_token_id=asr_processor.tokenizer.lang_code_to_id['it'],)
28
  translation = asr_processor.batch_decode(generated_ids, skip_special_tokens=True)
@@ -55,7 +60,7 @@ demo = gr.Blocks()
55
 
56
  mic_translate = gr.Interface(
57
  fn=speech_to_speech_translation,
58
- inputs=gr.Audio(source="microphone", type="filepath"),
59
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
60
  title=title,
61
  description=description,
@@ -63,7 +68,7 @@ mic_translate = gr.Interface(
63
 
64
  file_translate = gr.Interface(
65
  fn=speech_to_speech_translation,
66
- inputs=gr.Audio(source="upload", type="filepath"),
67
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
68
  examples=[["./example.wav"]],
69
  title=title,
 
22
 
23
 
24
  def translate(audio):
25
+ sr, y = audio
26
+ # if sr != 16000:
27
+ # y = torchaudio.resample(y, sr, 16000)
28
+ y = y.astype(np.float32)
29
+ y /= np.max(np.abs(y))
30
+ inputs = asr_processor(y, sampling_rate=16000, return_tensors="pt")
31
  generated_ids = asr_model.generate(inputs["input_features"],attention_mask=inputs["attention_mask"],
32
  forced_bos_token_id=asr_processor.tokenizer.lang_code_to_id['it'],)
33
  translation = asr_processor.batch_decode(generated_ids, skip_special_tokens=True)
 
60
 
61
  mic_translate = gr.Interface(
62
  fn=speech_to_speech_translation,
63
+ inputs=gr.Audio(source="microphone"),
64
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
65
  title=title,
66
  description=description,
 
68
 
69
  file_translate = gr.Interface(
70
  fn=speech_to_speech_translation,
71
+ inputs=gr.Audio(source="upload"),
72
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
73
  examples=[["./example.wav"]],
74
  title=title,