Ahsen Khaliq committed on
Commit
94602da
1 Parent(s): f7291a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -5
app.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
  from transformers import Wav2Vec2Processor, HubertForCTC
3
- import soundfile as sf
4
  import gradio as gr
5
  from moviepy.editor import *
6
  import cv2
 
7
 
8
  def get_optimal_font_scale(text, width):
9
  for scale in reversed(range(0, 60, 1)):
@@ -16,11 +16,12 @@ def get_optimal_font_scale(text, width):
16
 
17
  processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-xlarge-ls960-ft")
18
  model = HubertForCTC.from_pretrained("facebook/hubert-xlarge-ls960-ft")
19
- def map_to_array(file):
20
- speech, _ = sf.read(file)
21
- return speech
22
  def inference(audio, image):
23
- input_values = processor(map_to_array(audio.name), return_tensors="pt").input_values # Batch size 1
 
24
  logits = model(input_values).logits
25
  predicted_ids = torch.argmax(logits, dim=-1)
26
  transcription = processor.decode(predicted_ids[0])
1
  import torch
2
  from transformers import Wav2Vec2Processor, HubertForCTC
 
3
  import gradio as gr
4
  from moviepy.editor import *
5
  import cv2
6
+ import librosa
7
 
8
  def get_optimal_font_scale(text, width):
9
  for scale in reversed(range(0, 60, 1)):
16
 
17
  processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-xlarge-ls960-ft")
18
  model = HubertForCTC.from_pretrained("facebook/hubert-xlarge-ls960-ft")
19
+ #def map_to_array(file):
20
+ #speech, _ = sf.read(file)
21
+ #return speech
22
  def inference(audio, image):
23
+ y, sr = librosa.load(audio.name,sr=16000)
24
+ input_values = processor(y, return_tensors="pt").input_values # Batch size 1
25
  logits = model(input_values).logits
26
  predicted_ids = torch.argmax(logits, dim=-1)
27
  transcription = processor.decode(predicted_ids[0])