|
import csv
import functools
import os
import re
import ssl
from urllib.parse import parse_qs, urlparse

import whisper
import yt_dlp
from tqdm import tqdm
|
|
|
# NOTE(security): this swaps the ssl module's private default-HTTPS-context
# factory for the unverified one, which turns off TLS certificate
# verification for every caller in this process that relies on that default.
# It is kept to preserve existing behavior; prefer fixing the local CA
# bundle (e.g. installing/updating certificates) and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context

# Directory that receives the downloaded "<video_id>.mp3" audio files.
audio_folder = 'audio_files'

# Directory that receives the "<video_id>.txt" transcript files.
transcripts_folder = 'transcripts'
|
|
|
|
|
def sanitize_filename(filename, max_length=200):
    """Return *filename* with invalid characters removed, cut to *max_length*.

    The characters ``\\ / * ? : " < > |`` are deleted (not replaced), then
    the result is truncated so it is at most ``max_length`` characters long.

    Args:
        filename: The candidate file name.
        max_length: Maximum number of characters to keep. Defaults to 200,
            the limit this function always used.

    Returns:
        The cleaned, possibly truncated, file name. May be empty if every
        character was invalid.

    Raises:
        ValueError: If ``max_length`` is negative (a negative slice bound
            would silently drop characters from the end instead).
    """
    if max_length < 0:
        raise ValueError("max_length must be non-negative")

    cleaned = re.sub(r'[\\/*?:"<>|]', '', filename)
    # Slicing is a no-op when the string is already short enough.
    return cleaned[:max_length]
|
|
|
|
|
def download_audio_with_yt_dlp(video_id):
    """Download the audio track of one YouTube video via yt-dlp.

    The request asks for the ``bestaudio/best`` format and registers an
    ``FFmpegExtractAudio`` post-processor with ``mp3`` as preferred codec.
    Output is written under ``audio_folder`` using the video ID as the file
    stem, which is where ``process_video`` later looks for
    ``<video_id>.mp3``.

    Args:
        video_id: The YouTube video ID; it is inserted into a
            ``https://www.youtube.com/watch?v=...`` URL.
    """
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3'
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [mp3_extractor],
        # %(ext)s is left for yt-dlp to fill in.
        'outtmpl': f'{audio_folder}/{video_id}.%(ext)s'
    }
    watch_url = f'https://www.youtube.com/watch?v={video_id}'

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([watch_url])
|
|
|
|
|
@functools.lru_cache(maxsize=1)
def _load_whisper_model(model_name):
    """Load the named Whisper model, reusing the most recently loaded one."""
    return whisper.load_model(model_name)


def transcribe_audio_with_whisper(audio_file, model_name="base"):
    """Transcribe an audio file with Whisper and return the text.

    The model is loaded once and cached, so transcribing many files in a
    row does not reload the model for every file. Only the most recently
    used model is kept, to avoid holding several models in memory.

    Args:
        audio_file: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``. Defaults to
            ``"base"``, the model this function always used.

    Returns:
        The ``"text"`` entry of the result returned by ``model.transcribe``.
    """
    model = _load_whisper_model(model_name)
    result = model.transcribe(audio_file)
    return result["text"]
|
|
|
|
|
def process_video(video_id):
    """Download one video's audio, transcribe it, and save the transcript.

    Steps, in order:
      1. ``download_audio_with_yt_dlp`` fetches the audio for ``video_id``.
      2. ``transcribe_audio_with_whisper`` is run on
         ``<audio_folder>/<video_id>.mp3``.
      3. The transcript text is written (UTF-8, overwriting any existing
         file) to ``<transcripts_folder>/<video_id>.txt``.

    Args:
        video_id: The YouTube video ID, also used as the file stem for both
            the audio file and the transcript.
    """
    mp3_path = f"{audio_folder}/{video_id}.mp3"

    download_audio_with_yt_dlp(video_id)
    text = transcribe_audio_with_whisper(mp3_path)

    transcript_path = f"{transcripts_folder}/{video_id}.txt"
    with open(transcript_path, 'w', encoding="utf-8") as transcript_file:
        transcript_file.write(text)
|
|
|
|
|
def download_audio_from_playlist(playlist_url):
    """Run ``process_video`` for every video listed in a playlist.

    The playlist is only *listed* here (``extract_info(..., download=False)``
    with ``extract_flat``); each entry's ID is then handed to
    ``process_video``, which performs the actual download and transcription.

    Args:
        playlist_url: URL of the playlist to process.

    Notes:
        * If metadata extraction yields nothing (possible because
          ``ignoreerrors`` is enabled), the function returns without
          processing anything instead of failing on a ``None`` result.
        * If the result has no ``'entries'`` (the URL resolved to a single
          video rather than a playlist), that one video is processed.
        * Exceptions raised by ``process_video`` are not caught here.
    """
    # NOTE(review): with download=False below, the format / postprocessor /
    # outtmpl settings do not appear to be exercised by this function; the
    # download itself is done by process_video with its own options. They
    # are kept unchanged — confirm before removing.
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': f'{audio_folder}/%(id)s.%(ext)s',
        'ignoreerrors': True,
        'extract_flat': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(playlist_url, download=False)

        # Extraction failed and the error was suppressed by 'ignoreerrors'.
        if not info_dict:
            return

        entries = info_dict.get('entries')
        if entries is None:
            # Not a playlist result: treat it as a single video if it
            # carries an ID.
            single_id = info_dict.get('id')
            if single_id:
                process_video(single_id)
            return

        for video in entries:
            # Entries can be empty placeholders for unavailable videos.
            if video and video.get('id'):
                process_video(video['id'])
|
|
|
|
|
def _extract_video_id(url):
    """Return the YouTube video ID contained in *url*.

    Handles ``watch?v=ID`` URLs (ignoring extra parameters such as
    ``&t=10s``), ``youtu.be/ID`` short links and ``/shorts/ID``,
    ``/embed/ID``, ``/live/ID`` paths. Anything else falls back to the text
    after the last ``v=``, so a bare video ID is returned unchanged.
    """
    # Without a scheme urlparse would treat the host as part of the path.
    parsed = urlparse(url if '://' in url else '//' + url)

    ids = parse_qs(parsed.query).get('v')
    if ids:
        return ids[0]

    segments = [part for part in parsed.path.split('/') if part]
    if parsed.netloc.lower() in ('youtu.be', 'www.youtu.be') and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ('shorts', 'embed', 'live'):
        return segments[1]

    return url.split('v=')[-1]


def download_transcripts_from_csv(file_path):
    """Process every URL found in the first column of a CSV file.

    Each non-empty row's first cell is treated as a URL. URLs containing
    ``list=`` are handled as playlists via ``download_audio_from_playlist``;
    all others are reduced to a video ID and passed to ``process_video``.
    Blank rows and rows whose first cell is empty are skipped.

    Args:
        file_path: Path of the CSV file to read.
    """
    # newline='' is what the csv module expects for files it reads.
    with open(file_path, 'r', newline='') as csvfile:
        for row in tqdm(csv.reader(csvfile)):
            if not row:
                continue

            url = row[0].strip()
            if not url:
                continue

            if "list=" in url:
                download_audio_from_playlist(url)
            else:
                process_video(_extract_video_id(url))
|
|
|
|
|
if __name__ == "__main__":
    # Create both output directories up front (audio first, then
    # transcripts); exist_ok lets the script be rerun without errors.
    for folder in (audio_folder, transcripts_folder):
        os.makedirs(folder, exist_ok=True)

    # The list of video / playlist URLs is read from this file in the
    # current working directory.
    download_transcripts_from_csv('urls.csv')
|
|