File size: 2,827 Bytes
b2e30fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import csv
import re
import os
import yt_dlp
import whisper
from tqdm import tqdm
import ssl
# SECURITY NOTE: this swaps the ssl module's default HTTPS context factory for
# the unverified one, so HTTPS connections built from that default no longer
# validate server certificates -- for the whole process, not just one request.
# NOTE(review): presumably a workaround for local certificate errors while
# downloading -- confirm it is still needed; fixing the CA bundle is safer.
ssl._create_default_https_context = ssl._create_unverified_context
# Output folders (relative paths): downloaded audio is written under
# `audio_folder`, transcript text files under `transcripts_folder`.
audio_folder = 'audio_files'
transcripts_folder = 'transcripts'
def sanitize_filename(filename, max_length=200):
    """Return *filename* without reserved characters, cut to *max_length*.

    The characters ``\\ / * ? : " < > |`` are removed (not replaced), then the
    result is truncated so it is never longer than *max_length* characters.

    Args:
        filename: The raw name to clean up.
        max_length: Maximum number of characters to keep. Defaults to 200,
            the limit this function always used. Pass ``None`` to disable
            truncation.

    Returns:
        The cleaned, possibly shortened, name.

    Raises:
        ValueError: If *max_length* is negative.
    """
    if max_length is not None and max_length < 0:
        # A negative slice bound would silently drop characters from the end
        # instead of limiting the length, so reject it explicitly.
        raise ValueError("max_length must be non-negative or None")

    cleaned = re.sub(r'[\\/*?:"<>|]', '', filename)
    # Slicing already copes with strings shorter than the limit, and a bound
    # of None keeps the whole string.
    return cleaned[:max_length]
def download_audio_with_yt_dlp(video_id):
    """Download one YouTube video's audio and extract it as MP3.

    The file is written into ``audio_folder`` and named after *video_id*;
    the extension part of the name is filled in by yt_dlp's output template.

    Args:
        video_id: The id used in the ``watch?v=`` URL of the video.
    """
    # Post-processing step that turns the downloaded stream into an MP3.
    extract_mp3 = {'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}
    options = dict(
        format='bestaudio/best',
        postprocessors=[extract_mp3],
        outtmpl=f'{audio_folder}/{video_id}.%(ext)s',
    )
    watch_url = f'https://www.youtube.com/watch?v={video_id}'

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([watch_url])
# Whisper models already loaded in this process, keyed by model name.
_WHISPER_MODELS = {}


def _get_whisper_model(model_name):
    """Return the Whisper model *model_name*, loading it on first use only."""
    if model_name not in _WHISPER_MODELS:
        _WHISPER_MODELS[model_name] = whisper.load_model(model_name)
    return _WHISPER_MODELS[model_name]


def transcribe_audio_with_whisper(audio_file, model_name="base"):
    """Transcribe an audio file with Whisper and return the text.

    The model is loaded once per name and reused on later calls; previously
    it was reloaded for every single file, which repeats the same expensive
    load for each video of a playlist or CSV.

    Args:
        audio_file: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``. Defaults to
            ``"base"``, the model this function always used.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    model = _get_whisper_model(model_name)
    result = model.transcribe(audio_file)
    return result["text"]
def process_video(video_id):
    """Download, transcribe and store the transcript of a single video.

    The audio is fetched as ``<audio_folder>/<video_id>.mp3``, transcribed,
    and the resulting text is written to
    ``<transcripts_folder>/<video_id>.txt`` (UTF-8, overwriting any previous
    file of that name).

    Args:
        video_id: The id of the video to process.
    """
    mp3_path = f"{audio_folder}/{video_id}.mp3"

    # Step 1: fetch the audio track.
    download_audio_with_yt_dlp(video_id)

    # Step 2: turn the audio into text.
    text = transcribe_audio_with_whisper(mp3_path)

    # Step 3: persist the transcript next to the others.
    txt_path = f"{transcripts_folder}/{video_id}.txt"
    with open(txt_path, 'w', encoding="utf-8") as handle:
        handle.write(text)
def download_audio_from_playlist(playlist_url):
    """Download and transcribe every video of a playlist.

    The playlist is only listed here (``extract_info`` is called with
    ``download=False``); each entry's id is then handed to ``process_video``,
    which performs the actual download and transcription.

    Failures are reported and skipped so that one bad video, or a playlist
    that cannot be listed, does not abort the rest of the run.

    Args:
        playlist_url: URL of the playlist to process.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': f'{audio_folder}/%(id)s.%(ext)s',
        'ignoreerrors': True,  # Continue on download errors
        'extract_flat': True,  # Just get video IDs from the playlist
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(playlist_url, download=False)

        # With 'ignoreerrors' set, a failed extraction can come back as None
        # instead of raising, so indexing it blindly would crash here.
        if not info_dict:
            print(f"Could not read playlist, skipping: {playlist_url}")
            return

        for video in info_dict.get('entries') or []:
            # Deleted or unavailable entries show up as None / without an id.
            if not video or not video.get('id'):
                continue
            try:
                process_video(video['id'])
            except Exception as exc:
                # process_video uses its own downloader without 'ignoreerrors',
                # so without this guard a single failing video would stop the
                # whole playlist. Report it and move on to the next entry.
                print(f"Skipping video {video['id']}: {exc}")
def _extract_video_id(url):
    """Return the video id contained in a single-video URL.

    Handles ``...watch?v=<id>`` URLs (ignoring any further query parameters)
    and ``youtu.be/<id>`` short links. Anything else falls back to the text
    after the last ``v=``, cut at the first ``&``, so a bare id is returned
    unchanged.
    """
    # Imported here so this block does not rely on a new file-level import.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)
    ids_in_query = parse_qs(parsed.query).get('v')
    if ids_in_query:
        return ids_in_query[0]

    path_parts = [part for part in parsed.path.split('/') if part]
    if parsed.netloc.lower().endswith('youtu.be') and path_parts:
        return path_parts[0]

    return url.split('v=')[-1].split('&')[0]


def download_transcripts_from_csv(file_path):
    """Process every URL listed in the first column of a CSV file.

    A URL containing ``list=`` is treated as a playlist and passed to
    ``download_audio_from_playlist``; any other URL is treated as a single
    video and passed to ``process_video``. Rows that are empty, or whose
    first cell is blank, are skipped.

    Args:
        file_path: Path of the CSV file; only column 0 of each row is read.
    """
    # newline='' is what the csv module asks for on files given to csv.reader.
    with open(file_path, 'r', newline='') as csvfile:
        for row in tqdm(csv.reader(csvfile)):
            # Blank lines are returned as empty rows; row[0] would raise.
            url = row[0].strip() if row else ''
            if not url:
                continue

            if "list=" in url:
                # It's a playlist, process each video in the playlist
                download_audio_from_playlist(url)
            else:
                # It's a single video
                process_video(_extract_video_id(url))
if __name__ == "__main__":
    # Both output folders must exist before anything is downloaded or written.
    for folder in (audio_folder, transcripts_folder):
        os.makedirs(folder, exist_ok=True)

    # The list of video / playlist URLs is read from this file.
    csv_path = 'urls.csv'
    download_transcripts_from_csv(csv_path)
|