"""Download a YouTube video's audio, transcribe it, and export the segments to CSV."""

from collections.abc import Iterable
from typing import Optional

import pandas as pd
import yt_dlp as youtube_dl

# NOTE(review): `WhisperModel` is used in `transcribe` but is not imported in
# the visible source. The call shape (`WhisperModel(name).transcribe(path)`
# returning `(segments, info)`) looks like faster-whisper, i.e. presumably
# `from faster_whisper import WhisperModel` -- confirm and add the import.

# Column order of the DataFrame built by `process_segments`; passing it
# explicitly keeps the schema stable even when there are no segments.
_SEGMENT_COLUMNS = ["chunk_id", "chunk_length", "text", "start_time", "end_time"]


def download_youtube_audio(
    url: str, output_path: str, preferred_quality: str = "192"
) -> Optional[str]:
    """Download the audio track of *url* and convert it to MP3.

    Args:
        url: Address of the video to fetch.
        output_path: Passed to yt-dlp as its ``outtmpl`` option.
        preferred_quality: Passed to the ``FFmpegExtractAudio`` post-processor
            as ``preferredquality``.

    Returns:
        ``output_path`` exactly as given on success, or ``None`` when yt-dlp
        raises ``DownloadError`` (the error is printed, not re-raised).
    """
    # NOTE(review): the post-processor re-encodes to mp3, so the file that ends
    # up on disk presumably carries an .mp3 extension; callers should probably
    # pass an `output_path` ending in ".mp3" -- verify against yt-dlp behavior.
    ydl_opts = {
        'format': 'bestaudio/best',  # Select best audio quality
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': preferred_quality,
        }],
        'outtmpl': output_path,  # Specify the output path and file name
    }

    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            # Metadata-only pass so the title can be announced before the
            # (potentially slow) download starts.
            info_dict = ydl.extract_info(url, download=False)
            video_title = info_dict.get('title', None)
            print(f"Downloading audio for: {video_title}")
            ydl.download([url])
            print(f"Audio file saved as: {output_path}")
            return output_path
    except youtube_dl.utils.DownloadError as e:
        print(f"Error downloading audio: {e}")
        return None  # Indicate failure


def transcribe(path, model):
    """Transcribe the audio file at *path* with the Whisper model named *model*.

    Args:
        path: Audio file location, handed to ``WhisperModel.transcribe``.
        model: Model identifier handed to the ``WhisperModel`` constructor.

    Returns:
        The ``segments`` object returned by ``WhisperModel.transcribe``; the
        accompanying ``info`` value is discarded.
    """
    # Use a separate local name instead of rebinding the `model` parameter.
    whisper_model = WhisperModel(model)
    print(f"reading {path}")
    segments, _info = whisper_model.transcribe(path)
    return segments


def process_segments(segments: Iterable) -> pd.DataFrame:
    """Collect transcription segments into a DataFrame, one row per segment.

    Args:
        segments: Iterable of objects exposing ``id``, ``start``, ``end`` and
            ``text`` attributes. It is consumed exactly once.

    Returns:
        A DataFrame indexed by ``"chunk_<i>"`` (``i`` is the enumeration
        position) with the columns listed in ``_SEGMENT_COLUMNS``. When
        *segments* is empty the frame has those columns and no rows.
    """
    print("processing...")
    result = {
        f"chunk_{i}": {
            'chunk_id': segment.id,
            'chunk_length': segment.end - segment.start,
            'text': segment.text,
            'start_time': segment.start,
            'end_time': segment.end,
        }
        for i, segment in enumerate(segments)
    }
    return pd.DataFrame.from_dict(result, orient='index', columns=_SEGMENT_COLUMNS)


def gen_csv(
    url="https://www.youtube.com/watch?v=Sby1uJ_NFIY",
    path="audio.mp3",
    model="distil-large-v3",
    csv_path="alo.csv",
):
    """Run the full pipeline: download audio, transcribe it, write a CSV.

    All arguments are optional, so ``gen_csv()`` keeps working with no
    arguments.

    Args:
        url: Video to download.
        path: Where the downloaded audio is written. The original code read an
            undefined name ``path`` here; the default is a placeholder chosen
            during review -- adjust as needed.
        model: Whisper model identifier passed to ``transcribe``.
        csv_path: Destination of the resulting CSV file.

    Returns:
        ``None``. Nothing is written when the download fails.
    """
    audio_path = download_youtube_audio(url, path)
    if audio_path is None:
        # The download step has already printed the error; there is nothing
        # to transcribe, so stop instead of handing None to the model.
        return
    df = process_segments(transcribe(audio_path, model))
    df.to_csv(csv_path)