pratikshahp committed
Commit 8b331ad
1 Parent(s): 5c92be7

Update app.py

Files changed (1): app.py (+0, -9)
app.py CHANGED
@@ -54,9 +54,6 @@ import numpy as np
 # Load model directly
 from transformers import AutoProcessor, AutoModelForPreTraining
 
-processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
-model = AutoModelForPreTraining.from_pretrained("facebook/wav2vec2-base")
-
 def transcribe_audio(audio_bytes):
     # processor = AutoProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
     # model = AutoModelForSpeechSeq2Seq.from_pretrained("facebook/s2t-small-librispeech-asr")
@@ -64,20 +61,14 @@ def transcribe_audio(audio_bytes):
     model = AutoModelForPreTraining.from_pretrained("facebook/wav2vec2-base")
     # Convert audio bytes to numpy array
     audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
-
     # Normalize audio array
     audio_tensor = torch.tensor(audio_array, dtype=torch.float64) / 32768.0
-
     # Provide inputs to the processor
-    #inputs = processor(audio=audio_tensor, sampling_rate=16000, return_tensors="pt")
     input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features
-
     # generate token ids
     predicted_ids = model.generate(input_features)
     # decode token ids to text
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
-
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
     return transcription
 
 # Streamlit app
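Note: as committed, transcribe_audio is unlikely to run end to end. AutoModelForPreTraining loads Wav2Vec2ForPreTraining, which has no generate method; the wav2vec2 processor exposes input_values rather than input_features; and facebook/wav2vec2-base is a pretraining-only checkpoint, not one fine-tuned for speech recognition. A minimal sketch of a working pipeline, assuming 16 kHz 16-bit mono PCM input and swapping in the fine-tuned facebook/wav2vec2-base-960h checkpoint with greedy CTC decoding (an assumption, not part of this commit), could look like:

import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForCTC

# Load once at module level; wav2vec2-base-960h is a CTC checkpoint fine-tuned
# for English ASR (assumed here instead of the un-fine-tuned wav2vec2-base).
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")

def transcribe_audio(audio_bytes):
    # Interpret the raw bytes as 16-bit mono PCM at 16 kHz (assumption).
    audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
    # Normalize to float32 in [-1, 1); the model expects a float waveform.
    audio = audio_array.astype(np.float32) / 32768.0

    # The wav2vec2 processor returns input_values (raw waveform), not input_features.
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

    # Forward pass yields per-frame logits over the character vocabulary.
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Greedy CTC decoding: argmax per frame, then collapse repeats and blanks.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]

Greedy argmax is the simplest CTC decoding strategy; processor.batch_decode handles collapsing repeated tokens and removing the CTC blank symbol before returning the text.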
 