archit11 committed on
Commit
957fb6e
1 Parent(s): e21f6d3

Create transcriber.py

Browse files
Files changed (1) hide show
  1. transcriber.py +63 -0
transcriber.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Generator

import pandas as pd
import yt_dlp as youtube_dl
2
+
3
+
4
def download_youtube_audio(url, output_path, preferred_quality="192"):
    """Download the audio track of a video and convert it to MP3.

    Args:
        url: Address of the video to fetch.
        output_path: Output template handed to yt-dlp as ``outtmpl``.
        preferred_quality: Value forwarded to the FFmpeg audio extractor as
            ``preferredquality`` (presumably the MP3 bitrate in kbit/s).

    Returns:
        The path of the saved audio file, or ``None`` when yt-dlp raises a
        ``DownloadError``.
    """
    ydl_opts = {
        'format': 'bestaudio/best',  # Select best audio quality
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': preferred_quality,
        }],
        'outtmpl': output_path,  # Specify the output path and file name
    }

    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            # Metadata-only pass so the title can be announced before the
            # (potentially slow) download starts.
            info_dict = ydl.extract_info(url, download=False)
            video_title = (info_dict or {}).get('title', None)
            print(f"Downloading audio for: {video_title}")

            # Download through extract_info so the resulting info dict is
            # available for looking up the file that was actually written.
            downloaded = ydl.extract_info(url, download=True)
    except youtube_dl.utils.DownloadError as e:
        print(f"Error downloading audio: {e}")
        return None  # Indicate failure

    # The audio extractor saves the converted file with an ".mp3" extension,
    # which differs from output_path when that template has another (or no)
    # extension. Prefer the path yt-dlp reports; fall back to output_path
    # when it is not available (e.g. playlist results).
    final_path = output_path
    requested = (downloaded or {}).get('requested_downloads') or []
    if requested and requested[0].get('filepath'):
        final_path = requested[0]['filepath']

    print(f"Audio file saved as: {final_path}")
    return final_path
29
+
30
+
31
def transcribe(path, model):
    """Transcribe an audio file with a faster-whisper model.

    Args:
        path: Location of the audio file to transcribe.
        model: Model identifier passed to ``WhisperModel``
            (for example ``"distil-large-v3"``).

    Returns:
        The segments object produced by ``WhisperModel.transcribe``. The
        accompanying info object is discarded. NOTE(review): faster-whisper
        documents the segments as a lazy generator, so the transcription
        work presumably happens while the caller iterates - confirm.
    """
    # WhisperModel is not imported at module level; importing it here keeps
    # this function usable and defers loading the heavy dependency until a
    # transcription is actually requested.
    from faster_whisper import WhisperModel

    # Use a separate local name instead of rebinding the `model` argument.
    whisper = WhisperModel(model)

    print(f"reading {path}")
    segments, _info = whisper.transcribe(path)
    return segments
37
+
38
+
39
+
40
def process_segments(segments: Generator) -> pd.DataFrame:
    """Collect transcription segments into a DataFrame, one row per segment.

    Args:
        segments: Iterable of objects exposing ``id``, ``start``, ``end``
            and ``text`` attributes. It is consumed exactly once.

    Returns:
        A DataFrame indexed by ``chunk_<n>`` (n is the position in the
        stream) with the columns ``chunk_id``, ``chunk_length``, ``text``,
        ``start_time`` and ``end_time``. An empty input yields an empty
        frame that still carries these columns.
    """
    columns = ['chunk_id', 'chunk_length', 'text', 'start_time', 'end_time']
    labels = []
    rows = []
    print("processing...")
    for i, segment in enumerate(segments):
        labels.append(f"chunk_{i}")
        rows.append({
            'chunk_id': segment.id,
            'chunk_length': segment.end - segment.start,
            'text': segment.text,
            'start_time': segment.start,
            'end_time': segment.end,
        })

    # Build the frame once, after the loop, with an explicit schema so the
    # result has the same columns whether or not any segment was produced.
    return pd.DataFrame(rows, index=labels, columns=columns)
56
+
57
+
58
+
59
def gen_csv(
    url="https://www.youtube.com/watch?v=Sby1uJ_NFIY",
    audio_path="audio.mp3",
    model="distil-large-v3",
    csv_path="alo.csv",
):
    """Download a video's audio, transcribe it and write the result as CSV.

    Args:
        url: Video to download.
        audio_path: Where the downloaded audio should be stored.
        model: Model identifier forwarded to ``transcribe``.
        csv_path: Destination of the CSV file.

    Raises:
        RuntimeError: If the audio download fails (the downloader
            returned ``None``).
    """
    audio_file = download_youtube_audio(url, audio_path)
    if audio_file is None:
        # Stop here with a clear message instead of passing None on to the
        # transcription step.
        raise RuntimeError(f"Could not download audio from {url}")

    df = process_segments(transcribe(audio_file, model))
    df.to_csv(csv_path)
62
+
63
+