Spaces:

archit11
/

yt-chunks

Sleeping

App Files Files Community

archit11 committed on Jun 3

Commit

957fb6e

•

1 Parent(s): e21f6d3

Create transcriber.py

Browse files

Files changed (1) hide show

transcriber.py +63 -0

transcriber.py ADDED Viewed

	@@ -0,0 +1,63 @@

from typing import Generator

import pandas as pd
import yt_dlp as youtube_dl

# NOTE(review): `WhisperModel` is used by transcribe() but is never imported in
# this file; presumably `from faster_whisper import WhisperModel` -- confirm.
def download_youtube_audio(url, output_path, preferred_quality="192"):
    """Download the audio of ``url`` with yt_dlp and extract it as MP3.

    The video metadata is fetched first (without downloading) so the title
    can be printed, then the actual download is started.

    Args:
        url: Address of the video to fetch.
        output_path: Value handed to yt_dlp as its ``outtmpl`` option.
        preferred_quality: Value handed to the ``FFmpegExtractAudio``
            post-processor as ``preferredquality``.

    Returns:
        ``output_path`` when the download finishes without a
        ``DownloadError``; ``None`` when a ``DownloadError`` is raised.
    """
    # NOTE(review): the value returned is `output_path` exactly as passed in;
    # whether the post-processed file ends up under that exact name depends on
    # how yt_dlp expands the template -- confirm against the caller.
    extract_mp3 = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': preferred_quality,
    }
    options = {
        'format': 'bestaudio/best',  # best audio-only stream, else best overall
        'postprocessors': [extract_mp3],
        'outtmpl': output_path,
    }

    try:
        with youtube_dl.YoutubeDL(options) as downloader:
            metadata = downloader.extract_info(url, download=False)
            video_title = metadata.get('title')
            print(f"Downloading audio for: {video_title}")
            downloader.download([url])
            print(f"Audio file saved as: {output_path}")
    except youtube_dl.utils.DownloadError as e:
        print(f"Error downloading audio: {e}")
        return None  # signals failure to the caller
    else:
        return output_path
def transcribe(path, model):
    """Transcribe the audio file at ``path`` and return its segments.

    Args:
        path: Location of the audio file, passed to the model's
            ``transcribe`` method.
        model: Argument passed to the ``WhisperModel`` constructor.

    Returns:
        The first element of the pair returned by ``WhisperModel.transcribe``
        (the segments); the second element is discarded.
    """
    # NOTE(review): `WhisperModel` is not imported in the visible file;
    # presumably faster_whisper.WhisperModel -- confirm.
    whisper = WhisperModel(model)
    print(f"reading {path}")
    segments, _info = whisper.transcribe(path)
    return segments
def process_segments(segments: Generator):
    """Collect transcription segments into a pandas DataFrame.

    Each segment must expose ``id``, ``start``, ``end`` and ``text``
    attributes. Rows are labelled ``chunk_0``, ``chunk_1``, ... in iteration
    order.

    Args:
        segments: Iterable of segment objects; it is consumed exactly once.

    Returns:
        A DataFrame with the columns ``chunk_id`` (the segment's own ``id``),
        ``chunk_length`` (``end - start``), ``text``, ``start_time`` and
        ``end_time``. The columns are present even when ``segments`` yields
        nothing, so downstream CSV output always has the same header.
    """
    columns = ['chunk_id', 'chunk_length', 'text', 'start_time', 'end_time']
    print("processing...")
    rows = {
        f"chunk_{i}": {
            'chunk_id': segment.id,
            'chunk_length': segment.end - segment.start,
            'text': segment.text,
            'start_time': segment.start,
            'end_time': segment.end,
        }
        for i, segment in enumerate(segments)
    }
    # Explicit `columns` keeps the schema stable for empty input; without it
    # an empty dict produces a frame with no columns at all.
    return pd.DataFrame.from_dict(rows, orient='index', columns=columns)
def gen_csv(
    url="https://www.youtube.com/watch?v=Sby1uJ_NFIY",
    path="audio.mp3",
    model="distil-large-v3",
    csv_path="alo.csv",
):
    """Download a video's audio, transcribe it and write the segments to CSV.

    The original body referenced an undefined name ``path``; it is now a
    parameter, alongside the previously hard-coded URL, model name and CSV
    file name. Calling ``gen_csv()`` with no arguments uses the original
    URL, model and CSV name.

    Args:
        url: Video address passed to ``download_youtube_audio``.
        path: Output path passed to ``download_youtube_audio``.
        model: Model argument passed to ``transcribe``.
        csv_path: Destination passed to ``DataFrame.to_csv``.

    Raises:
        RuntimeError: If ``download_youtube_audio`` reports failure by
            returning ``None``.
    """
    audio_path = download_youtube_audio(url, path)
    if audio_path is None:
        # Stop here instead of handing None to the transcriber.
        raise RuntimeError(f"Could not download audio from {url}")
    df = process_segments(transcribe(audio_path, model))
    df.to_csv(csv_path)