sakshigpatil's picture
Add Python files that include Whisper, gTTS, Wav2Vec2, and sounddevice
428ab68 verified
```python
import whisper
import tempfile
import os
class WhisperTranscriber:
    """Speech-to-text helper around an OpenAI Whisper model.

    Loads the model once at construction and exposes convenience methods
    for transcribing files, raw bytes, detecting language, and getting
    word-level timestamps.
    """

    def __init__(self, model_size="base"):
        """
        Load a Whisper model into memory.

        Args:
            model_size (str): One of the released checkpoints:
                tiny, base, small, medium, large.
        """
        self.model = whisper.load_model(model_size)

    def transcribe_audio(self, audio_path, language=None):
        """
        Transcribe an audio file with the loaded Whisper model.

        Args:
            audio_path (str): Path to an audio file readable by ffmpeg.
            language (str, optional): Language code (e.g., 'en').
                If None, Whisper auto-detects the language.

        Returns:
            dict: Transcription result containing text, segments, language, etc.
        """
        result = self.model.transcribe(audio_path, language=language)
        return result

    def transcribe_bytes(self, audio_bytes, temp_prefix="whisper_temp", suffix=".wav"):
        """
        Transcribe raw audio bytes by staging them in a temporary file.

        Args:
            audio_bytes (bytes): Raw audio data (a complete encoded file,
                e.g. WAV/MP3 contents — not bare PCM samples).
            temp_prefix (str): Prefix for the temporary file name.
            suffix (str): File extension for the temp file; helps ffmpeg
                pick the right demuxer. Defaults to ".wav".

        Returns:
            dict: Transcription result.
        """
        # NOTE: NamedTemporaryFile(delete=True) cannot be reopened by name
        # while still open on Windows, and Whisper/ffmpeg open the path
        # themselves. Use mkstemp so the handle is closed before
        # transcription, and clean up explicitly afterwards.
        fd, temp_path = tempfile.mkstemp(prefix=temp_prefix, suffix=suffix)
        try:
            with os.fdopen(fd, "wb") as temp_file:
                temp_file.write(audio_bytes)
            return self.transcribe_audio(temp_path)
        finally:
            os.remove(temp_path)

    def detect_language(self, audio_path):
        """
        Detect the spoken language of an audio file.

        Args:
            audio_path (str): Path to audio file.

        Returns:
            str: Most probable language code (e.g., 'en').
        """
        # Whisper's language detector operates on a fixed 30-second window.
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)
        # Compute the log-Mel spectrogram on the model's device.
        mel = whisper.log_mel_spectrogram(audio).to(self.model.device)
        # detect_language returns (tokens, {lang_code: probability}).
        _, probs = self.model.detect_language(mel)
        return max(probs, key=probs.get)

    def transcribe_with_timestamps(self, audio_path):
        """
        Transcribe an audio file with word-level timestamps.

        Args:
            audio_path (str): Path to audio file.

        Returns:
            dict: Transcription whose segments include per-word timestamps.
        """
        result = self.model.transcribe(audio_path, word_timestamps=True)
        return result
```