File size: 2,827 Bytes
b2e30fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import csv
import re
import os
import yt_dlp
import whisper
from tqdm import tqdm
import ssl

# SECURITY NOTE: this swaps ssl's default HTTPS context factory for the
# unverified one, so HTTPS connections in this process that rely on that
# default skip certificate verification.
# NOTE(review): presumably a workaround for certificate errors while
# downloading -- confirm it is still needed before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context


# Output folders: downloaded audio goes to `audio_folder`, generated
# transcripts go to `transcripts_folder`. Both are relative paths.
audio_folder = 'audio_files'
transcripts_folder = 'transcripts'


def sanitize_filename(filename):
    """
    Strips characters that are not allowed in filenames (\\ / * ? : " < > |)
    and caps the result at 200 characters.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', '', filename)
    # Slicing is a no-op for names that are already short enough
    return cleaned[:200]


def download_audio_with_yt_dlp(video_id):
    """
    Downloads the best available audio of one YouTube video via yt-dlp and
    extracts it to mp3 with the FFmpegExtractAudio post-processor.

    The output file is placed in ``audio_folder`` and named after the video ID.
    """
    watch_url = f'https://www.youtube.com/watch?v={video_id}'
    extract_mp3 = {'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}
    options = dict(
        format='bestaudio/best',
        postprocessors=[extract_mp3],
        outtmpl=f'{audio_folder}/{video_id}.%(ext)s',
    )

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([watch_url])


# Loaded Whisper models keyed by model name, so the weights are loaded once
# per process instead of once per transcribed file.
_whisper_model_cache = {}


def _get_whisper_model(name="base"):
    """Returns the Whisper model called *name*, loading it on first use only."""
    if name not in _whisper_model_cache:
        _whisper_model_cache[name] = whisper.load_model(name)
    return _whisper_model_cache[name]


def transcribe_audio_with_whisper(audio_file, model_name="base"):
    """
    Transcribes an audio file with Whisper and returns the recognised text.

    Args:
        audio_file: Path of the audio file to transcribe.
        model_name: Name of the Whisper model to use. Defaults to "base",
            which is what this function always used before.

    Returns:
        The "text" entry of Whisper's transcription result.
    """
    # Reuse the cached model: loading it is expensive and this function is
    # called once per video.
    model = _get_whisper_model(model_name)
    result = model.transcribe(audio_file)
    return result["text"]


def process_video(video_id):
    """
    Runs the full pipeline for one video: download the audio, transcribe it,
    and store the transcript on disk.

    The transcript is written as UTF-8 to ``<transcripts_folder>/<video_id>.txt``.
    """
    # Step 1: fetch the audio (written to the audio folder as <video_id>.mp3)
    download_audio_with_yt_dlp(video_id)

    # Step 2: turn the downloaded mp3 into text
    mp3_path = f"{audio_folder}/{video_id}.mp3"
    text = transcribe_audio_with_whisper(mp3_path)

    # Step 3: persist the transcript
    transcript_path = f"{transcripts_folder}/{video_id}.txt"
    with open(transcript_path, 'w', encoding="utf-8") as transcript_file:
        transcript_file.write(text)


def download_audio_from_playlist(playlist_url):
    """
    Downloads and transcribes every available video of a YouTube playlist.

    The playlist itself is only listed here (flat extraction, nothing is
    downloaded by this call); each listed entry is then handed to
    ``process_video``, which does the download and transcription.

    Args:
        playlist_url: URL of the playlist to process.

    Returns:
        None. If the playlist cannot be resolved or has no entries, nothing
        is processed.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': f'{audio_folder}/%(id)s.%(ext)s',
        'ignoreerrors': True,  # Continue on download errors
        'extract_flat': True,  # Just get video IDs from the playlist
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(playlist_url, download=False)

        # With 'ignoreerrors' enabled, extract_info reports a failure by
        # returning None instead of raising, so there may be nothing to iterate.
        if not info_dict:
            return

        # A URL that resolves to a single video has no 'entries' key.
        for video in info_dict.get('entries') or []:
            if video:  # Video is not None or deleted
                process_video(video['id'])


def _extract_video_id(url):
    """Returns the YouTube video ID contained in *url*."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)

    # Standard watch URLs: take only the 'v' parameter, so extra parameters
    # such as '&t=10s' do not end up in the ID.
    ids = parse_qs(parsed.query).get('v')
    if ids:
        return ids[0]

    # Short links: https://youtu.be/<id>
    if parsed.netloc.lower().endswith('youtu.be'):
        short_id = parsed.path.strip('/').split('/')[0]
        if short_id:
            return short_id

    # Fallback keeps the previous behaviour (e.g. a bare video ID in the CSV).
    return url.split('v=')[-1]


def download_transcripts_from_csv(file_path):
    """
    Processes every URL listed in the first column of a CSV file.

    URLs containing "list=" are treated as playlists and expanded video by
    video; every other URL is treated as a single video. Blank rows and rows
    whose first cell is empty are skipped.

    Args:
        file_path: Path of the CSV file holding one URL per row.
    """
    # newline='' is what the csv module expects for files it reads.
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in tqdm(reader):
            # csv.reader yields [] for blank lines; row[0] would raise IndexError.
            if not row:
                continue
            url = row[0].strip()
            if not url:
                continue

            if "list=" in url:
                # It's a playlist, process each video in the playlist
                download_audio_from_playlist(url)
            else:
                # It's a single video
                process_video(_extract_video_id(url))


if __name__ == "__main__":
    # Create both output directories up front (no error if they already exist)
    for folder in (audio_folder, transcripts_folder):
        os.makedirs(folder, exist_ok=True)

    # Process every URL listed in the CSV file
    csv_path = 'urls.csv'
    download_transcripts_from_csv(csv_path)