sakshigpatil's picture
Add Python files that include Whisper, gTTS, Wav2Vec2, and sounddevice
428ab68 verified
```python
import whisper
import tempfile
import os
class WhisperTranscriber:
    """Speech-to-text helper around an OpenAI Whisper model.

    Loads the model once at construction and exposes convenience methods
    for transcribing files, raw bytes, detecting language, and getting
    word-level timestamps.
    """

    def __init__(self, model_size="base"):
        """
        Load a Whisper model into memory.

        Args:
            model_size (str): One of the released checkpoints:
                tiny, base, small, medium, large.
        """
        self.model = whisper.load_model(model_size)

    def transcribe_audio(self, audio_path, language=None):
        """
        Transcribe an audio file with the loaded Whisper model.

        Args:
            audio_path (str): Path to an audio file readable by ffmpeg.
            language (str, optional): Language code (e.g., 'en').
                If None, Whisper auto-detects the language.

        Returns:
            dict: Transcription result containing text, segments, language, etc.
        """
        result = self.model.transcribe(audio_path, language=language)
        return result

    def transcribe_bytes(self, audio_bytes, temp_prefix="whisper_temp", suffix=".wav"):
        """
        Transcribe raw audio bytes by staging them in a temporary file.

        Args:
            audio_bytes (bytes): Raw audio data (a complete encoded file,
                e.g. WAV/MP3 contents — not bare PCM samples).
            temp_prefix (str): Prefix for the temporary file name.
            suffix (str): File extension for the temp file; helps ffmpeg
                pick the right demuxer. Defaults to ".wav".

        Returns:
            dict: Transcription result.
        """
        # NOTE: NamedTemporaryFile(delete=True) cannot be reopened by name
        # while still open on Windows, and Whisper/ffmpeg open the path
        # themselves. Use mkstemp so the handle is closed before
        # transcription, and clean up explicitly afterwards.
        fd, temp_path = tempfile.mkstemp(prefix=temp_prefix, suffix=suffix)
        try:
            with os.fdopen(fd, "wb") as temp_file:
                temp_file.write(audio_bytes)
            return self.transcribe_audio(temp_path)
        finally:
            os.remove(temp_path)

    def detect_language(self, audio_path):
        """
        Detect the spoken language of an audio file.

        Args:
            audio_path (str): Path to audio file.

        Returns:
            str: Most probable language code (e.g., 'en').
        """
        # Whisper's language detector operates on a fixed 30-second window.
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)
        # Compute the log-Mel spectrogram on the model's device.
        mel = whisper.log_mel_spectrogram(audio).to(self.model.device)
        # detect_language returns (tokens, {lang_code: probability}).
        _, probs = self.model.detect_language(mel)
        return max(probs, key=probs.get)

    def transcribe_with_timestamps(self, audio_path):
        """
        Transcribe an audio file with word-level timestamps.

        Args:
            audio_path (str): Path to audio file.

        Returns:
            dict: Transcription whose segments include per-word timestamps.
        """
        result = self.model.transcribe(audio_path, word_timestamps=True)
        return result
```