File size: 1,710 Bytes
957fb6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from typing import Generator

import pandas as pd
import yt_dlp as youtube_dl
# NOTE(review): `WhisperModel` is used by transcribe() but is not imported
# anywhere in this file; presumably it comes from faster_whisper - confirm
# and add the import.


def download_youtube_audio(url, output_path, preferred_quality="192"):
    """Fetch the audio of ``url`` via yt-dlp and extract it as MP3.

    Args:
        url: Address of the video, handed to yt-dlp unchanged.
        output_path: Value used as yt-dlp's ``outtmpl`` output template.
        preferred_quality: Forwarded as ``preferredquality`` to the
            ``FFmpegExtractAudio`` postprocessor (presumably a bitrate in
            kbps - confirm against the yt-dlp documentation).

    Returns:
        ``output_path`` when the download completes, or ``None`` when
        yt-dlp raises ``DownloadError``.
    """
    # Postprocessor that turns the downloaded stream into an MP3 file.
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': preferred_quality,
    }
    options = {
        'format': 'bestaudio/best',  # best audio-only stream, else best overall
        'postprocessors': [mp3_extractor],
        'outtmpl': output_path,  # output location / file name template
    }

    try:
        with youtube_dl.YoutubeDL(options) as downloader:
            # Metadata-only pass so the title can be reported up front.
            metadata = downloader.extract_info(url, download=False)
            title = metadata.get('title')
            print(f"Downloading audio for: {title}")

            downloader.download([url])
            print(f"Audio file saved as: {output_path}")
    except youtube_dl.utils.DownloadError as err:
        print(f"Error downloading audio: {err}")
        return None  # signal failure to the caller

    return output_path


def transcribe(path, model):
    """Transcribe the audio at ``path`` with a Whisper model.

    Args:
        path: Audio input handed to ``WhisperModel.transcribe``. Must not be
            ``None`` (``download_youtube_audio`` returns ``None`` on failure).
        model: Value passed to the ``WhisperModel`` constructor (e.g. a model
            name such as ``"distil-large-v3"``).

    Returns:
        The first element of the pair returned by ``WhisperModel.transcribe``
        (the segments; presumably a lazy generator - confirm against the
        library documentation).

    Raises:
        ValueError: If ``path`` is ``None``.
    """
    # Fail fast: reject a missing input before paying for the model load.
    if path is None:
        raise ValueError("transcribe() requires an audio path, got None")

    # Keep the loaded model in its own name instead of rebinding the
    # ``model`` parameter.
    whisper = WhisperModel(model)

    print(f"reading {path}")
    # The second element (transcription info) is not used here.
    segments, _ = whisper.transcribe(path)
    return segments



def process_segments(segments: Generator):
    """Collect transcription segments into a pandas DataFrame.

    Each segment becomes one row labelled ``chunk_<n>`` (``n`` is the
    zero-based position in ``segments``). Any iterable of objects exposing
    ``id``, ``start``, ``end`` and ``text`` attributes is accepted.

    Args:
        segments: Iterable of segment objects; it is consumed exactly once.

    Returns:
        DataFrame with columns ``chunk_id``, ``chunk_length``, ``text``,
        ``start_time`` and ``end_time``. The columns are present even when
        ``segments`` yields nothing, so a CSV written from the result always
        has a header row.
    """
    columns = ['chunk_id', 'chunk_length', 'text', 'start_time', 'end_time']

    print("processing...")
    labels = []
    rows = []
    for i, segment in enumerate(segments):
        labels.append(f"chunk_{i}")
        rows.append({
            'chunk_id': segment.id,
            'chunk_length': segment.end - segment.start,
            'text': segment.text,
            'start_time': segment.start,
            'end_time': segment.end,
        })

    # Explicit index and columns keep row order and schema stable,
    # including for an empty input.
    return pd.DataFrame(rows, index=labels, columns=columns)



def gen_csv(url="https://www.youtube.com/watch?v=Sby1uJ_NFIY",
            path="audio.mp3",
            model="distil-large-v3",
            csv_path='alo.csv'):
    """Download a video's audio, transcribe it and write the result as CSV.

    Runs ``download_youtube_audio`` -> ``transcribe`` -> ``process_segments``
    and saves the resulting DataFrame with ``DataFrame.to_csv``.

    Args:
        url: Video to download. Defaults to the previously hard-coded URL.
        path: Output path given to ``download_youtube_audio``. The function
            used to reference an undefined ``path`` name; the default
            ``"audio.mp3"`` matches the MP3 codec the downloader converts to.
        model: Whisper model identifier passed to ``transcribe``.
        csv_path: Destination of the CSV file.

    Raises:
        RuntimeError: If the audio download failed (the downloader
            returned ``None``).
    """
    audio_path = download_youtube_audio(url, path)
    # The downloader reports failure by returning None; stop here instead of
    # handing None to the transcription step.
    if audio_path is None:
        raise RuntimeError(f"Could not download audio from {url}")

    df = process_segments(transcribe(audio_path, model))
    df.to_csv(csv_path)