# Speech-to-text tool: downloads a YouTube video's audio track and
# transcribes it with WhisperX (batched Whisper + alignment + diarization).
import os
import subprocess

import whisperx
from pydub import AudioSegment
from pytube import YouTube


class SpeechToText:
    def __init__(
        self,
        video_url,
        audio_format="mp3",
        device="cuda",
        batch_size=16,
        compute_type="float16",
        hf_api_key=None,
    ):
        """
        Example usage:
            video_url = "url"
            speech_to_text = SpeechToText(video_url)
            transcription = speech_to_text.transcribe_youtube_video()
            print(transcription)
        """
        self.video_url = video_url
        self.audio_format = audio_format
        self.device = device
        self.batch_size = batch_size
        self.compute_type = compute_type
        self.hf_api_key = hf_api_key

    def install(self):
        """Install the third-party dependencies this tool relies on."""
        subprocess.run(["pip", "install", "whisperx"])
        subprocess.run(["pip", "install", "pytube"])
        subprocess.run(["pip", "install", "pydub"])

    def download_youtube_video(self):
        """Download the video's audio stream and convert it to self.audio_format."""
        audio_file = f"video.{self.audio_format}"

        # Download the audio-only stream 📥
        yt = YouTube(self.video_url)
        yt_stream = yt.streams.filter(only_audio=True).first()
        yt_stream.download(filename="video.mp4")

        # Convert the downloaded container to the target audio format 🎧
        video = AudioSegment.from_file("video.mp4", format="mp4")
        video.export(audio_file, format=self.audio_format)
        os.remove("video.mp4")

        return audio_file

    def transcribe_youtube_video(self):
        """Download the configured YouTube video and return its transcription."""
        audio_file = self.download_youtube_video()

        # 1. Transcribe with original Whisper (batched) 🗣️
        # compute_type must be passed by keyword; the configured instance
        # settings are used instead of re-hardcoding device/batch_size here.
        model = whisperx.load_model(
            "large-v2", self.device, compute_type=self.compute_type
        )
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(audio, batch_size=self.batch_size)

        # 2. Align Whisper output 🔍
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"], device=self.device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            self.device,
            return_char_alignments=False,
        )

        # 3. Assign speaker labels 🏷️
        # The diarization output must be fed back into the transcript;
        # discarding it would leave the segments unlabeled.
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=self.hf_api_key, device=self.device
        )
        diarize_segments = diarize_model(audio_file)
        result = whisperx.assign_word_speakers(diarize_segments, result)

        try:
            segments = result["segments"]
            transcription = " ".join(segment["text"] for segment in segments)
            return transcription
        except KeyError:
            print("The key 'segments' is not found in the result.")

    def transcribe(self, audio_file):
        """Transcribe a local audio file using the same pipeline as above."""
        # 1. Transcribe with original Whisper (batched) 🗣️
        model = whisperx.load_model(
            "large-v2", self.device, compute_type=self.compute_type
        )
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(audio, batch_size=self.batch_size)

        # 2. Align Whisper output 🔍
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"], device=self.device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            self.device,
            return_char_alignments=False,
        )

        # 3. Assign speaker labels 🏷️
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=self.hf_api_key, device=self.device
        )
        diarize_segments = diarize_model(audio_file)
        result = whisperx.assign_word_speakers(diarize_segments, result)

        try:
            segments = result["segments"]
            transcription = " ".join(segment["text"] for segment in segments)
            return transcription
        except KeyError:
            print("The key 'segments' is not found in the result.")
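

# --- Usage sketch ---
# A minimal, hedged example of driving the class above, expanding on the
# docstring's example. The URL is a placeholder, and HF_TOKEN is an assumed
# environment variable holding a Hugging Face token, which the diarization
# pipeline requires for gated pyannote models.
if __name__ == "__main__":
    speech_to_text = SpeechToText(
        video_url="https://www.youtube.com/watch?v=<video_id>",  # placeholder
        device="cuda",  # on CPU-only machines, "cpu" with compute_type="int8" is the usual fallback
        hf_api_key=os.environ.get("HF_TOKEN"),  # assumed env var, needed for diarization
    )
    transcription = speech_to_text.transcribe_youtube_video()
    print(transcription)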