Theo Alves Da Costa committed on
Commit
85d8489
1 Parent(s): 10b3d4f

First commit

Browse files
.github/workflows/check.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Check file size
2
+ on: # or directly `on: [push]` to run the action on every push on any branch
3
+ pull_request:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Check large files
14
+ uses: ActionsDesk/lfs-warning@v2.0
15
+ with:
16
+ filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/main.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+ - name: Push to hub
18
+ env:
19
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
+ run: git push https://dataforgood:$HF_TOKEN@huggingface.co/spaces/dataforgood/bechdelai-demo main
.gitignore CHANGED
@@ -6,6 +6,9 @@ __pycache__/
6
  # C extensions
7
  *.so
8
 
 
 
 
9
  # Distribution / packaging
10
  .Python
11
  build/
6
  # C extensions
7
  *.so
8
 
9
+ *.wav
10
+ *.mp4
11
+
12
  # Distribution / packaging
13
  .Python
14
  build/
README.md CHANGED
@@ -1 +1 @@
1
- # bechdelai-demo
1
+ # bechdelai-tool-demo
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Inspired from https://huggingface.co/spaces/vumichien/whisper-speaker-diarization/blob/main/app.py
2
+
3
+ import whisper
4
+ import datetime
5
+ import subprocess
6
+ import gradio as gr
7
+ from pathlib import Path
8
+ import pandas as pd
9
+ import re
10
+ import time
11
+ import os
12
+ import numpy as np
13
+
14
+ from pytube import YouTube
15
+ import torch
16
+ # import pyannote.audio
17
+ # from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
18
+ # from pyannote.audio import Audio
19
+ # from pyannote.core import Segment
20
+ # from sklearn.cluster import AgglomerativeClustering
21
+
22
+ from gpuinfo import GPUInfo
23
+
24
+ import wave
25
+ import contextlib
26
+ from transformers import pipeline
27
+ import psutil
28
+
29
+ # Custom code
30
+ from bechdelaidemo.utils import download_youtube_video
31
+ from bechdelaidemo.utils import extract_audio_from_movie
32
+
33
+ # Constants
34
+ whisper_models = ["tiny.en","base.en","tiny","base", "small", "medium", "large"]
35
+ device = 0 if torch.cuda.is_available() else "cpu"
36
+ os.makedirs('output', exist_ok=True)
37
+
38
+ # Prepare embedding model
39
+ # embedding_model = PretrainedSpeakerEmbedding(
40
+ # "speechbrain/spkrec-ecapa-voxceleb",
41
+ # device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
42
+
43
def get_youtube(video_url):
    """Download a YouTube video as a progressive MP4 and return its local path.

    Parameters
    ----------
    video_url : str
        URL of the YouTube video to download.

    Returns
    -------
    str
        Path of the downloaded file, as returned by pytube's ``download()``.

    Raises
    ------
    ValueError
        If the video exposes no progressive MP4 stream.
    """
    yt = YouTube(video_url)
    # Keep the highest-resolution progressive MP4 stream.
    stream = (
        yt.streams.filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if stream is None:
        # ``first()`` yields None on an empty query; fail with a clear message
        # instead of an AttributeError on ``None.download``.
        raise ValueError(f"No progressive mp4 stream available for {video_url!r}")
    abs_video_path = stream.download()
    print("Success download video")
    print(abs_video_path)
    return abs_video_path
49
+
50
+ def _return_yt_html_embed(yt_url):
51
+ video_id = yt_url.split("?v=")[-1]
52
+ HTML_str = (
53
+ f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
54
+ " </center>"
55
+ )
56
+ return HTML_str
57
+
58
+
59
def speech_to_text(video_filepath, selected_source_lang = "en", whisper_model = "tiny.en"):
    """Transcribe the audio track of a video file with OpenAI Whisper.

    The audio is extracted to a ``.wav`` file with ``extract_audio_from_movie``
    and passed to the selected Whisper model. No speaker diarization is
    performed here: only the plain transcript is returned.

    Speech recognition is based on OpenAI Whisper https://github.com/openai/whisper

    Parameters
    ----------
    video_filepath : str
        Path of the video file to transcribe.
    selected_source_lang : str
        Language code passed to Whisper as ``language`` (default ``"en"``).
    whisper_model : str
        Name of the Whisper model to load (default ``"tiny.en"``).

    Returns
    -------
    list
        One-element list holding the stripped transcript text.

    Raises
    ------
    ValueError
        If ``video_filepath`` is empty or ``None`` (no video was provided).
    """
    if not video_filepath:
        # Fail early with a readable message instead of an obscure error
        # deep inside the audio extraction.
        raise ValueError("No video file provided for transcription")

    # Convert video to audio
    audio_filepath = extract_audio_from_movie(video_filepath, ".wav")

    # Load whisper
    model = whisper.load_model(whisper_model)

    # Get duration (logged for information only)
    with wave.open(audio_filepath, 'r') as f:
        duration = f.getnframes() / float(f.getframerate())
    print(f"conversion to wav ready, duration of audio file: {duration}")

    # Transcribe audio
    transcribe_options = dict(
        task="transcribe",
        language=selected_source_lang,
        beam_size=5,
        best_of=5,
    )
    result = model.transcribe(audio_filepath, **transcribe_options)
    text = result["text"].strip()
    print("starting whisper done with whisper")

    return [text]
94
+
95
source_language_list = ["en","fr"]

# ---- Gradio Layout -----
# Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
# Components are created up front at module level and placed in the layout
# later with ``.render()``; each one must be rendered exactly once.
video_in = gr.Video(label="Video file", mirror_webcam=False)
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="tiny.en", label="Selected Whisper model", interactive=True)
# transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
output_text = gr.Textbox(label = "Transcribed text",lines = 10)

title = "BechdelAI - demo"
demo = gr.Blocks(title=title,live = True)
demo.encrypt = False


with demo:
    with gr.Tab("BechdelAI - dialogue demo"):
        gr.Markdown('''
            <div>
            <h1 style='text-align: center'>BechdelAI - Dialogue demo</h1>
            # <img src="logo.png" style="width:200px"/>
            </div>
        ''')

        with gr.Row():
            gr.Markdown('''
            ### Transcribe youtube link using OpenAI Whisper
            ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
            ##### 2. Generating speaker embeddings for each segments.
            ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
            ''')

        with gr.Row():

            with gr.Column():
                # gr.Markdown('''### You can test by following examples:''')
                examples = gr.Examples(examples=
                        [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
                          "https://www.youtube.com/watch?v=-UX0X45sYe4",
                          "https://www.youtube.com/watch?v=7minSgqi-Gw"],
                    label="Examples", inputs=[youtube_url_in])

            # Single download column: the original listed this column twice,
            # which rendered ``youtube_url_in`` a second time in the same
            # Blocks and registered the click handler twice.
            with gr.Column():
                youtube_url_in.render()
                download_youtube_btn = gr.Button("Download Youtube video")
                download_youtube_btn.click(get_youtube, [youtube_url_in], [
                    video_in])
                print(video_in)

            with gr.Column():
                video_in.render()

        with gr.Row():
            with gr.Column():
                with gr.Column():
                    gr.Markdown('''
                    ##### Here you can start the transcription process.
                    ##### Please select the source language for transcription.
                    ##### You should select a number of speakers for getting better results.
                    ''')
                selected_source_lang.render()
                selected_whisper_model.render()
                transcribe_btn = gr.Button("Transcribe audio and diarization")
                transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], [output_text])


        # with gr.Row():
        #     gr.Markdown('''
        #     ##### Here you will get transcription output
        #     ##### ''')

        with gr.Row():
            with gr.Column():
                output_text.render()

demo.launch(debug=True)
bechdelaidemo/__init_.py ADDED
File without changes
bechdelaidemo/utils.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pytube import YouTube
2
+ import moviepy.editor as mp
3
+
4
+
5
def download_youtube_video(link: str, filename: str, caption_language: str = "en") -> None:
    """Download a YouTube video and its matching captions (best effort).

    Every step is wrapped in its own ``try`` block: failures are printed and
    never raised to the caller. Note that "Task Completed!" is printed even
    when the video or caption download failed.

    Parameters
    ----------
    link : str
        Youtube video link
    filename : str
        File name to save the video; ``.mp4`` is appended when missing.
    caption_language : str
        Captions whose language code contains this string are downloaded,
        each as ``caption_<lang>``.
    """
    try:
        yt = YouTube(link)
    except Exception as e:
        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        print("Connection Error", e)
        return

    if not filename.endswith(".mp4"):
        filename = filename + ".mp4"

    try:
        # Highest-resolution progressive MP4 stream.
        (
            yt.streams.filter(progressive=True, file_extension="mp4")
            .order_by("resolution")
            .desc()
            .first()
        ).download(filename=filename)

    except Exception as e:
        print("Could not download the video!", e)

    try:
        captions = {
            k: v
            for k, v in yt.captions.lang_code_index.items()
            if caption_language in k
        }
        for lang, caption in captions.items():
            caption.download(title=f"caption_{lang}", srt=False)
    except Exception as e:
        print("Could not download the caption!", e)
    print("Task Completed!")
54
+
55
+
56
def download_youtube_audio(link:str,filename:str = "audio.mp3") -> str:
    """Download the first audio-only stream of a YouTube video.

    Parameters
    ----------
    link : str
        Youtube video link
    filename : str
        File name passed to the stream's ``download`` (default "audio.mp3").

    Returns
    -------
    str
        The ``filename`` argument, unchanged.
    """
    audio_streams = YouTube(link).streams.filter(only_audio=True)
    audio_streams[0].download(filename=filename)
    return filename
61
+
62
+
63
def import_as_clip(path_to_video: str) -> mp.VideoFileClip:
    """Open a video file as a moviepy clip.

    Parameters:
        path_to_video (str): Location of the video file to open.

    Returns:
        mp.VideoFileClip: Clip object built from ``path_to_video``.
    """
    clip = mp.VideoFileClip(path_to_video)
    return clip
73
+
74
def extract_audio_from_movie(file: str, extension: str = '.wav') -> str:
    """Extract the audio from a film and save it to a file.

    The audio is saved next to the film, under the same name with its
    extension replaced by ``extension``.

    Parameters:
        file (str): The name of the film file to extract the audio from.
        extension (str): The file extension of the audio file to save (default is ".wav").

    Returns:
        str: Path of the audio file that was written.

    Raises:
        ValueError: If the film has no audio track.
    """
    # Local import: this module does not import ``os`` at file level.
    import os

    clip = import_as_clip(file)
    try:
        if clip.audio is None:
            raise ValueError(f"No audio track found in {file!r}")
        # ``splitext`` only strips the final extension, so dots elsewhere in
        # the path ("./video.mp4", "You vs. Me.mp4") are preserved.
        filename = os.path.splitext(file)[0] + extension
        clip.audio.write_audiofile(filename)
    finally:
        # Close the clip so the underlying file readers are released.
        clip.close()
    return filename
logo.png ADDED
notebooks/20230225 - Demo ODI.ipynb ADDED
@@ -0,0 +1,1675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "5813b894",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "The autoreload extension is already loaded. To reload it, use:\n",
14
+ " %reload_ext autoreload\n"
15
+ ]
16
+ }
17
+ ],
18
+ "source": [
19
+ "# For developers who want to use the latest development version or the library locally\n",
20
+ "# Use poetry to install dependencies\n",
21
+ "import sys\n",
22
+ "sys.path.append(\"../\") # Or change this to the folder that contains the bechdelaidemo package\n",
23
+ "\n",
24
+ "%load_ext autoreload\n",
25
+ "%autoreload 2\n",
26
+ "\n",
27
+ "import bechdelaidemo"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "markdown",
32
+ "id": "6521fddd",
33
+ "metadata": {},
34
+ "source": [
35
+ "# Bechdel test on a dialogue scene\n",
36
+ "The test is already done on a dialogue scene. So easier, because we don't have to extract dialogue segments from a video"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "markdown",
41
+ "id": "dfa5ac0e",
42
+ "metadata": {},
43
+ "source": [
44
+ "## Test on 2 dialogue scenes"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 1,
50
+ "id": "b657fa08",
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "path1 = \"https://www.youtube.com/watch?v=b2f2Kqt_KcE&ab_channel=Movieclips\" #Devils wears prada\n",
55
+ "path2 = \"https://www.youtube.com/watch?v=FDFdroN7d0w\" #Marriage story"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "markdown",
60
+ "id": "fe733fa4",
61
+ "metadata": {},
62
+ "source": [
63
+ "## Download videos and audios from Youtube\n",
64
+ "Inspiration from https://huggingface.co/spaces/vumichien/whisper-speaker-diarization/blob/main/app.py"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 5,
70
+ "id": "ac260c66",
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "from bechdelaidemo.utils import download_youtube_video"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 6,
80
+ "id": "5aa1ce3d",
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stdout",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "Task Completed!\n"
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "download_youtube_video(path2,\"video.mp4\")"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 7,
98
+ "id": "d78d3671",
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "video_path = \"video.mp4\""
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "markdown",
107
+ "id": "999e8c78",
108
+ "metadata": {},
109
+ "source": [
110
+ "## Convert video to audio wav file"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 8,
116
+ "id": "766a386d",
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "from bechdelaidemo.utils import extract_audio_from_movie"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 9,
126
+ "id": "8f5028ec",
127
+ "metadata": {},
128
+ "outputs": [
129
+ {
130
+ "name": "stdout",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "MoviePy - Writing audio in video.wav\n"
134
+ ]
135
+ },
136
+ {
137
+ "name": "stderr",
138
+ "output_type": "stream",
139
+ "text": [
140
+ " "
141
+ ]
142
+ },
143
+ {
144
+ "name": "stdout",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "MoviePy - Done.\n"
148
+ ]
149
+ },
150
+ {
151
+ "name": "stderr",
152
+ "output_type": "stream",
153
+ "text": [
154
+ "\r"
155
+ ]
156
+ }
157
+ ],
158
+ "source": [
159
+ "extract_audio_from_movie(video_path,\".wav\")"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 10,
165
+ "id": "9af64b12",
166
+ "metadata": {},
167
+ "outputs": [],
168
+ "source": [
169
+ "audio_file = \"video.wav\""
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "markdown",
174
+ "id": "88a8f8f1",
175
+ "metadata": {},
176
+ "source": [
177
+ "## Clean audio\n",
178
+ "- https://huggingface.co/speechbrain/sepformer-wham16k-enhancement\n",
179
+ "- Spleeter"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "markdown",
184
+ "id": "4f74de64",
185
+ "metadata": {},
186
+ "source": [
187
+ "## Extract info using Whisper"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": 11,
193
+ "id": "eb33487c",
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "import whisper"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 12,
203
+ "id": "12588878",
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "name": "stderr",
208
+ "output_type": "stream",
209
+ "text": [
210
+ "100%|█████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 90.2MiB/s]\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "model = whisper.load_model(\"tiny.en\")"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": 15,
221
+ "id": "59386a8d",
222
+ "metadata": {
223
+ "scrolled": false
224
+ },
225
+ "outputs": [
226
+ {
227
+ "name": "stdout",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "{'task': 'transcribe', 'language': 'en', 'beam_size': 5, 'best_of': 5}\n"
231
+ ]
232
+ }
233
+ ],
234
+ "source": [
235
+ "## Transcribe audio\n",
236
+ "options = dict(language=\"en\", beam_size=5, best_of=5)\n",
237
+ "transcribe_options = dict(task=\"transcribe\", **options)\n",
238
+ "\n",
239
+ "print(transcribe_options)\n",
240
+ "\n",
241
+ "result = model.transcribe(\"video.wav\", **transcribe_options)\n",
242
+ "segments = result[\"segments\"]"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 16,
248
+ "id": "4a2357b8",
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "data": {
253
+ "text/plain": [
254
+ "\" You're being so much like your father. Do not compare me to my father. I didn't compare you, Dan. I said you were acting like him. You're exactly like your mother. Everything you're complaining about her you're doing. You're suffocating in me. First of all, I love my mother. She was a wonderful mother. Just repeating what you told me. Secondly, how dare you compare my mother into my mother? I'm maybe like my father, but I am not like my mother. You are! And you're like my father. You're also like my mother. You're all the bad things about all of these people. But mostly your mother. When we would lie in bed together, sometimes I would look at you and see her and just feel so gross. I felt appalled when you touched her. You're a slob. I mean all the beds, clothes on the camera, and it steps together with you like anything. You want to heal my skin off. You'll never be happy. And now lay her anywhere. You'll think you found some better opposite guy than me. And in a few years, you rebel against him because you need to have your voice. But you don't want a voice. You just want to fucking complain about not having a voice. I think about being married to you and that woman is a stranger to me. I mean we had a child and a bitch. You've gone back to your life before you met me. It's pathetic. People used to tell me that you were too selfish to be a great artist. And I used to defend you. They were absolutely right. All your best acting is behind you. You're back to being a hack. You got slighted me. You're a fucking villain. And you want to present yourself as a victim because it's a good legal strategy? Fine. But you and I both know you chose this life. You wanted it until you didn't. You used me so you could get out of LA. I didn't use you. You did. And then you blamed me for it. You always made me aware of what I was doing wrong. How I was falling short. Life with you was joyless. Then you had to go and fuck someone. You should be upset that I fucked her. 
You should be upset that I had a laugh with her. Do you love her? No. But she didn't hate me. You hated me. You hated me. You hated me. You fucked somebody we worked with. You stopped having sex with me in the last year. I never cheated on you. What was cheating on me? But there's so much I could have done. I was a director in my 20s who came from nothing and was suddenly on the cover of fucking Time Out New York. I was hot shit and I wanted to fuck everybody and I didn't. And I loved you and I didn't want to lose you. But I had made my 20s. And I didn't want to lose that too. And I kind of did. And you wanted so much so fast. I didn't even want to get married. Fuck it. There's so much I didn't do. Thanks for that. You're welcome. I can't believe I didn't know you forever. You're fucking insane. And you're fucking winning. Are you kidding me? I'm wanting to be married. I don't already lost. You didn't love me as much as I loved you. What does that have to do with LA? What? You're so merged with your own selfishness. You don't need to identify it and selfishness anymore. You're such a dick. Every day I wake up and I hope you're dead. Dead like it. If I can guarantee every movie, okay? I don't think I'm gonna kill this. As I can hit by a car and die. You don't work with me once you're left alone. I lost you and you 2, 3 4 I lost you. I lost you, 2 4 I lost you, 2 5 I'm sorry. Sheila. I'm sorry. Did you know? You\""
255
+ ]
256
+ },
257
+ "execution_count": 16,
258
+ "metadata": {},
259
+ "output_type": "execute_result"
260
+ }
261
+ ],
262
+ "source": [
263
+ "result[\"text\"]"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 17,
269
+ "id": "26fcf349",
270
+ "metadata": {},
271
+ "outputs": [
272
+ {
273
+ "data": {
274
+ "text/plain": [
275
+ "[{'id': 0,\n",
276
+ " 'seek': 0,\n",
277
+ " 'start': 0.0,\n",
278
+ " 'end': 3.0,\n",
279
+ " 'text': \" You're being so much like your father.\",\n",
280
+ " 'tokens': [921, 821, 852, 523, 881, 588, 534, 2988, 13],\n",
281
+ " 'temperature': 0.0,\n",
282
+ " 'avg_logprob': -0.15723278113370817,\n",
283
+ " 'compression_ratio': 1.9415807560137457,\n",
284
+ " 'no_speech_prob': 0.16063228249549866},\n",
285
+ " {'id': 1,\n",
286
+ " 'seek': 0,\n",
287
+ " 'start': 3.0,\n",
288
+ " 'end': 5.0,\n",
289
+ " 'text': ' Do not compare me to my father.',\n",
290
+ " 'tokens': [2141, 407, 8996, 502, 284, 616, 2988, 13],\n",
291
+ " 'temperature': 0.0,\n",
292
+ " 'avg_logprob': -0.15723278113370817,\n",
293
+ " 'compression_ratio': 1.9415807560137457,\n",
294
+ " 'no_speech_prob': 0.16063228249549866},\n",
295
+ " {'id': 2,\n",
296
+ " 'seek': 0,\n",
297
+ " 'start': 5.0,\n",
298
+ " 'end': 6.0,\n",
299
+ " 'text': \" I didn't compare you, Dan.\",\n",
300
+ " 'tokens': [314, 1422, 470, 8996, 345, 11, 6035, 13],\n",
301
+ " 'temperature': 0.0,\n",
302
+ " 'avg_logprob': -0.15723278113370817,\n",
303
+ " 'compression_ratio': 1.9415807560137457,\n",
304
+ " 'no_speech_prob': 0.16063228249549866},\n",
305
+ " {'id': 3,\n",
306
+ " 'seek': 0,\n",
307
+ " 'start': 6.0,\n",
308
+ " 'end': 7.0,\n",
309
+ " 'text': ' I said you were acting like him.',\n",
310
+ " 'tokens': [314, 531, 345, 547, 7205, 588, 683, 13],\n",
311
+ " 'temperature': 0.0,\n",
312
+ " 'avg_logprob': -0.15723278113370817,\n",
313
+ " 'compression_ratio': 1.9415807560137457,\n",
314
+ " 'no_speech_prob': 0.16063228249549866},\n",
315
+ " {'id': 4,\n",
316
+ " 'seek': 0,\n",
317
+ " 'start': 7.0,\n",
318
+ " 'end': 8.0,\n",
319
+ " 'text': \" You're exactly like your mother.\",\n",
320
+ " 'tokens': [921, 821, 3446, 588, 534, 2802, 13],\n",
321
+ " 'temperature': 0.0,\n",
322
+ " 'avg_logprob': -0.15723278113370817,\n",
323
+ " 'compression_ratio': 1.9415807560137457,\n",
324
+ " 'no_speech_prob': 0.16063228249549866},\n",
325
+ " {'id': 5,\n",
326
+ " 'seek': 0,\n",
327
+ " 'start': 8.0,\n",
328
+ " 'end': 10.0,\n",
329
+ " 'text': \" Everything you're complaining about her you're doing.\",\n",
330
+ " 'tokens': [11391, 345, 821, 18705, 546, 607, 345, 821, 1804, 13],\n",
331
+ " 'temperature': 0.0,\n",
332
+ " 'avg_logprob': -0.15723278113370817,\n",
333
+ " 'compression_ratio': 1.9415807560137457,\n",
334
+ " 'no_speech_prob': 0.16063228249549866},\n",
335
+ " {'id': 6,\n",
336
+ " 'seek': 0,\n",
337
+ " 'start': 10.0,\n",
338
+ " 'end': 12.0,\n",
339
+ " 'text': \" You're suffocating in me.\",\n",
340
+ " 'tokens': [921, 821, 3027, 27123, 287, 502, 13],\n",
341
+ " 'temperature': 0.0,\n",
342
+ " 'avg_logprob': -0.15723278113370817,\n",
343
+ " 'compression_ratio': 1.9415807560137457,\n",
344
+ " 'no_speech_prob': 0.16063228249549866},\n",
345
+ " {'id': 7,\n",
346
+ " 'seek': 0,\n",
347
+ " 'start': 12.0,\n",
348
+ " 'end': 13.0,\n",
349
+ " 'text': ' First of all, I love my mother.',\n",
350
+ " 'tokens': [3274, 286, 477, 11, 314, 1842, 616, 2802, 13],\n",
351
+ " 'temperature': 0.0,\n",
352
+ " 'avg_logprob': -0.15723278113370817,\n",
353
+ " 'compression_ratio': 1.9415807560137457,\n",
354
+ " 'no_speech_prob': 0.16063228249549866},\n",
355
+ " {'id': 8,\n",
356
+ " 'seek': 0,\n",
357
+ " 'start': 13.0,\n",
358
+ " 'end': 15.0,\n",
359
+ " 'text': ' She was a wonderful mother.',\n",
360
+ " 'tokens': [1375, 373, 257, 7932, 2802, 13],\n",
361
+ " 'temperature': 0.0,\n",
362
+ " 'avg_logprob': -0.15723278113370817,\n",
363
+ " 'compression_ratio': 1.9415807560137457,\n",
364
+ " 'no_speech_prob': 0.16063228249549866},\n",
365
+ " {'id': 9,\n",
366
+ " 'seek': 0,\n",
367
+ " 'start': 15.0,\n",
368
+ " 'end': 16.0,\n",
369
+ " 'text': ' Just repeating what you told me.',\n",
370
+ " 'tokens': [2329, 20394, 644, 345, 1297, 502, 13],\n",
371
+ " 'temperature': 0.0,\n",
372
+ " 'avg_logprob': -0.15723278113370817,\n",
373
+ " 'compression_ratio': 1.9415807560137457,\n",
374
+ " 'no_speech_prob': 0.16063228249549866},\n",
375
+ " {'id': 10,\n",
376
+ " 'seek': 0,\n",
377
+ " 'start': 16.0,\n",
378
+ " 'end': 19.0,\n",
379
+ " 'text': ' Secondly, how dare you compare my mother into my mother?',\n",
380
+ " 'tokens': [34276, 11, 703, 16498, 345, 8996, 616, 2802, 656, 616, 2802, 30],\n",
381
+ " 'temperature': 0.0,\n",
382
+ " 'avg_logprob': -0.15723278113370817,\n",
383
+ " 'compression_ratio': 1.9415807560137457,\n",
384
+ " 'no_speech_prob': 0.16063228249549866},\n",
385
+ " {'id': 11,\n",
386
+ " 'seek': 0,\n",
387
+ " 'start': 19.0,\n",
388
+ " 'end': 22.0,\n",
389
+ " 'text': \" I'm maybe like my father, but I am not like my mother.\",\n",
390
+ " 'tokens': [314,\n",
391
+ " 1101,\n",
392
+ " 3863,\n",
393
+ " 588,\n",
394
+ " 616,\n",
395
+ " 2988,\n",
396
+ " 11,\n",
397
+ " 475,\n",
398
+ " 314,\n",
399
+ " 716,\n",
400
+ " 407,\n",
401
+ " 588,\n",
402
+ " 616,\n",
403
+ " 2802,\n",
404
+ " 13],\n",
405
+ " 'temperature': 0.0,\n",
406
+ " 'avg_logprob': -0.15723278113370817,\n",
407
+ " 'compression_ratio': 1.9415807560137457,\n",
408
+ " 'no_speech_prob': 0.16063228249549866},\n",
409
+ " {'id': 12,\n",
410
+ " 'seek': 0,\n",
411
+ " 'start': 22.0,\n",
412
+ " 'end': 23.0,\n",
413
+ " 'text': ' You are!',\n",
414
+ " 'tokens': [921, 389, 0],\n",
415
+ " 'temperature': 0.0,\n",
416
+ " 'avg_logprob': -0.15723278113370817,\n",
417
+ " 'compression_ratio': 1.9415807560137457,\n",
418
+ " 'no_speech_prob': 0.16063228249549866},\n",
419
+ " {'id': 13,\n",
420
+ " 'seek': 0,\n",
421
+ " 'start': 23.0,\n",
422
+ " 'end': 25.0,\n",
423
+ " 'text': \" And you're like my father.\",\n",
424
+ " 'tokens': [843, 345, 821, 588, 616, 2988, 13],\n",
425
+ " 'temperature': 0.0,\n",
426
+ " 'avg_logprob': -0.15723278113370817,\n",
427
+ " 'compression_ratio': 1.9415807560137457,\n",
428
+ " 'no_speech_prob': 0.16063228249549866},\n",
429
+ " {'id': 14,\n",
430
+ " 'seek': 0,\n",
431
+ " 'start': 25.0,\n",
432
+ " 'end': 26.0,\n",
433
+ " 'text': \" You're also like my mother.\",\n",
434
+ " 'tokens': [921, 821, 635, 588, 616, 2802, 13],\n",
435
+ " 'temperature': 0.0,\n",
436
+ " 'avg_logprob': -0.15723278113370817,\n",
437
+ " 'compression_ratio': 1.9415807560137457,\n",
438
+ " 'no_speech_prob': 0.16063228249549866},\n",
439
+ " {'id': 15,\n",
440
+ " 'seek': 0,\n",
441
+ " 'start': 26.0,\n",
442
+ " 'end': 29.0,\n",
443
+ " 'text': \" You're all the bad things about all of these people.\",\n",
444
+ " 'tokens': [921, 821, 477, 262, 2089, 1243, 546, 477, 286, 777, 661, 13],\n",
445
+ " 'temperature': 0.0,\n",
446
+ " 'avg_logprob': -0.15723278113370817,\n",
447
+ " 'compression_ratio': 1.9415807560137457,\n",
448
+ " 'no_speech_prob': 0.16063228249549866},\n",
449
+ " {'id': 16,\n",
450
+ " 'seek': 2900,\n",
451
+ " 'start': 29.0,\n",
452
+ " 'end': 30.0,\n",
453
+ " 'text': ' But mostly your mother.',\n",
454
+ " 'tokens': [887, 4632, 534, 2802, 13],\n",
455
+ " 'temperature': 0.0,\n",
456
+ " 'avg_logprob': -0.22024561564127604,\n",
457
+ " 'compression_ratio': 1.8056338028169014,\n",
458
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
459
+ " {'id': 17,\n",
460
+ " 'seek': 2900,\n",
461
+ " 'start': 30.0,\n",
462
+ " 'end': 32.0,\n",
463
+ " 'text': ' When we would lie in bed together, sometimes I would look at you',\n",
464
+ " 'tokens': [1649,\n",
465
+ " 356,\n",
466
+ " 561,\n",
467
+ " 6486,\n",
468
+ " 287,\n",
469
+ " 3996,\n",
470
+ " 1978,\n",
471
+ " 11,\n",
472
+ " 3360,\n",
473
+ " 314,\n",
474
+ " 561,\n",
475
+ " 804,\n",
476
+ " 379,\n",
477
+ " 345],\n",
478
+ " 'temperature': 0.0,\n",
479
+ " 'avg_logprob': -0.22024561564127604,\n",
480
+ " 'compression_ratio': 1.8056338028169014,\n",
481
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
482
+ " {'id': 18,\n",
483
+ " 'seek': 2900,\n",
484
+ " 'start': 32.0,\n",
485
+ " 'end': 34.0,\n",
486
+ " 'text': ' and see her and just feel so gross.',\n",
487
+ " 'tokens': [290, 766, 607, 290, 655, 1254, 523, 10319, 13],\n",
488
+ " 'temperature': 0.0,\n",
489
+ " 'avg_logprob': -0.22024561564127604,\n",
490
+ " 'compression_ratio': 1.8056338028169014,\n",
491
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
492
+ " {'id': 19,\n",
493
+ " 'seek': 2900,\n",
494
+ " 'start': 34.0,\n",
495
+ " 'end': 36.0,\n",
496
+ " 'text': ' I felt appalled when you touched her.',\n",
497
+ " 'tokens': [314, 2936, 41586, 618, 345, 12615, 607, 13],\n",
498
+ " 'temperature': 0.0,\n",
499
+ " 'avg_logprob': -0.22024561564127604,\n",
500
+ " 'compression_ratio': 1.8056338028169014,\n",
501
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
502
+ " {'id': 20,\n",
503
+ " 'seek': 2900,\n",
504
+ " 'start': 36.0,\n",
505
+ " 'end': 37.0,\n",
506
+ " 'text': \" You're a slob.\",\n",
507
+ " 'tokens': [921, 821, 257, 1017, 672, 13],\n",
508
+ " 'temperature': 0.0,\n",
509
+ " 'avg_logprob': -0.22024561564127604,\n",
510
+ " 'compression_ratio': 1.8056338028169014,\n",
511
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
512
+ " {'id': 21,\n",
513
+ " 'seek': 2900,\n",
514
+ " 'start': 37.0,\n",
515
+ " 'end': 39.0,\n",
516
+ " 'text': ' I mean all the beds, clothes on the camera,',\n",
517
+ " 'tokens': [314, 1612, 477, 262, 20237, 11, 8242, 319, 262, 4676, 11],\n",
518
+ " 'temperature': 0.0,\n",
519
+ " 'avg_logprob': -0.22024561564127604,\n",
520
+ " 'compression_ratio': 1.8056338028169014,\n",
521
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
522
+ " {'id': 22,\n",
523
+ " 'seek': 2900,\n",
524
+ " 'start': 39.0,\n",
525
+ " 'end': 41.0,\n",
526
+ " 'text': ' and it steps together with you like anything.',\n",
527
+ " 'tokens': [290, 340, 4831, 1978, 351, 345, 588, 1997, 13],\n",
528
+ " 'temperature': 0.0,\n",
529
+ " 'avg_logprob': -0.22024561564127604,\n",
530
+ " 'compression_ratio': 1.8056338028169014,\n",
531
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
532
+ " {'id': 23,\n",
533
+ " 'seek': 2900,\n",
534
+ " 'start': 41.0,\n",
535
+ " 'end': 42.0,\n",
536
+ " 'text': ' You want to heal my skin off.',\n",
537
+ " 'tokens': [921, 765, 284, 12035, 616, 4168, 572, 13],\n",
538
+ " 'temperature': 0.0,\n",
539
+ " 'avg_logprob': -0.22024561564127604,\n",
540
+ " 'compression_ratio': 1.8056338028169014,\n",
541
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
542
+ " {'id': 24,\n",
543
+ " 'seek': 2900,\n",
544
+ " 'start': 42.0,\n",
545
+ " 'end': 43.0,\n",
546
+ " 'text': \" You'll never be happy.\",\n",
547
+ " 'tokens': [921, 1183, 1239, 307, 3772, 13],\n",
548
+ " 'temperature': 0.0,\n",
549
+ " 'avg_logprob': -0.22024561564127604,\n",
550
+ " 'compression_ratio': 1.8056338028169014,\n",
551
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
552
+ " {'id': 25,\n",
553
+ " 'seek': 2900,\n",
554
+ " 'start': 43.0,\n",
555
+ " 'end': 44.0,\n",
556
+ " 'text': ' And now lay her anywhere.',\n",
557
+ " 'tokens': [843, 783, 3830, 607, 6609, 13],\n",
558
+ " 'temperature': 0.0,\n",
559
+ " 'avg_logprob': -0.22024561564127604,\n",
560
+ " 'compression_ratio': 1.8056338028169014,\n",
561
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
562
+ " {'id': 26,\n",
563
+ " 'seek': 2900,\n",
564
+ " 'start': 44.0,\n",
565
+ " 'end': 47.0,\n",
566
+ " 'text': \" You'll think you found some better opposite guy than me.\",\n",
567
+ " 'tokens': [921, 1183, 892, 345, 1043, 617, 1365, 6697, 3516, 621, 502, 13],\n",
568
+ " 'temperature': 0.0,\n",
569
+ " 'avg_logprob': -0.22024561564127604,\n",
570
+ " 'compression_ratio': 1.8056338028169014,\n",
571
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
572
+ " {'id': 27,\n",
573
+ " 'seek': 2900,\n",
574
+ " 'start': 47.0,\n",
575
+ " 'end': 51.0,\n",
576
+ " 'text': ' And in a few years, you rebel against him because you need to have your voice.',\n",
577
+ " 'tokens': [843,\n",
578
+ " 287,\n",
579
+ " 257,\n",
580
+ " 1178,\n",
581
+ " 812,\n",
582
+ " 11,\n",
583
+ " 345,\n",
584
+ " 14034,\n",
585
+ " 1028,\n",
586
+ " 683,\n",
587
+ " 780,\n",
588
+ " 345,\n",
589
+ " 761,\n",
590
+ " 284,\n",
591
+ " 423,\n",
592
+ " 534,\n",
593
+ " 3809,\n",
594
+ " 13],\n",
595
+ " 'temperature': 0.0,\n",
596
+ " 'avg_logprob': -0.22024561564127604,\n",
597
+ " 'compression_ratio': 1.8056338028169014,\n",
598
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
599
+ " {'id': 28,\n",
600
+ " 'seek': 2900,\n",
601
+ " 'start': 51.0,\n",
602
+ " 'end': 53.0,\n",
603
+ " 'text': \" But you don't want a voice.\",\n",
604
+ " 'tokens': [887, 345, 836, 470, 765, 257, 3809, 13],\n",
605
+ " 'temperature': 0.0,\n",
606
+ " 'avg_logprob': -0.22024561564127604,\n",
607
+ " 'compression_ratio': 1.8056338028169014,\n",
608
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
609
+ " {'id': 29,\n",
610
+ " 'seek': 2900,\n",
611
+ " 'start': 53.0,\n",
612
+ " 'end': 55.0,\n",
613
+ " 'text': ' You just want to fucking complain about not having a voice.',\n",
614
+ " 'tokens': [921, 655, 765, 284, 9372, 13121, 546, 407, 1719, 257, 3809, 13],\n",
615
+ " 'temperature': 0.0,\n",
616
+ " 'avg_logprob': -0.22024561564127604,\n",
617
+ " 'compression_ratio': 1.8056338028169014,\n",
618
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
619
+ " {'id': 30,\n",
620
+ " 'seek': 2900,\n",
621
+ " 'start': 55.0,\n",
622
+ " 'end': 58.0,\n",
623
+ " 'text': ' I think about being married to you and that woman is a stranger to me.',\n",
624
+ " 'tokens': [314,\n",
625
+ " 892,\n",
626
+ " 546,\n",
627
+ " 852,\n",
628
+ " 6405,\n",
629
+ " 284,\n",
630
+ " 345,\n",
631
+ " 290,\n",
632
+ " 326,\n",
633
+ " 2415,\n",
634
+ " 318,\n",
635
+ " 257,\n",
636
+ " 16195,\n",
637
+ " 284,\n",
638
+ " 502,\n",
639
+ " 13],\n",
640
+ " 'temperature': 0.0,\n",
641
+ " 'avg_logprob': -0.22024561564127604,\n",
642
+ " 'compression_ratio': 1.8056338028169014,\n",
643
+ " 'no_speech_prob': 9.823902473726775e-07},\n",
644
+ " {'id': 31,\n",
645
+ " 'seek': 5800,\n",
646
+ " 'start': 58.0,\n",
647
+ " 'end': 61.0,\n",
648
+ " 'text': ' I mean we had a child and a bitch.',\n",
649
+ " 'tokens': [314, 1612, 356, 550, 257, 1200, 290, 257, 21551, 13],\n",
650
+ " 'temperature': 0.0,\n",
651
+ " 'avg_logprob': -0.12269237166956852,\n",
652
+ " 'compression_ratio': 1.6801346801346801,\n",
653
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
654
+ " {'id': 32,\n",
655
+ " 'seek': 5800,\n",
656
+ " 'start': 61.0,\n",
657
+ " 'end': 63.0,\n",
658
+ " 'text': \" You've gone back to your life before you met me.\",\n",
659
+ " 'tokens': [921, 1053, 3750, 736, 284, 534, 1204, 878, 345, 1138, 502, 13],\n",
660
+ " 'temperature': 0.0,\n",
661
+ " 'avg_logprob': -0.12269237166956852,\n",
662
+ " 'compression_ratio': 1.6801346801346801,\n",
663
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
664
+ " {'id': 33,\n",
665
+ " 'seek': 5800,\n",
666
+ " 'start': 63.0,\n",
667
+ " 'end': 64.0,\n",
668
+ " 'text': \" It's pathetic.\",\n",
669
+ " 'tokens': [632, 338, 29215, 13],\n",
670
+ " 'temperature': 0.0,\n",
671
+ " 'avg_logprob': -0.12269237166956852,\n",
672
+ " 'compression_ratio': 1.6801346801346801,\n",
673
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
674
+ " {'id': 34,\n",
675
+ " 'seek': 5800,\n",
676
+ " 'start': 64.0,\n",
677
+ " 'end': 68.0,\n",
678
+ " 'text': ' People used to tell me that you were too selfish to be a great artist.',\n",
679
+ " 'tokens': [4380,\n",
680
+ " 973,\n",
681
+ " 284,\n",
682
+ " 1560,\n",
683
+ " 502,\n",
684
+ " 326,\n",
685
+ " 345,\n",
686
+ " 547,\n",
687
+ " 1165,\n",
688
+ " 20363,\n",
689
+ " 284,\n",
690
+ " 307,\n",
691
+ " 257,\n",
692
+ " 1049,\n",
693
+ " 6802,\n",
694
+ " 13],\n",
695
+ " 'temperature': 0.0,\n",
696
+ " 'avg_logprob': -0.12269237166956852,\n",
697
+ " 'compression_ratio': 1.6801346801346801,\n",
698
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
699
+ " {'id': 35,\n",
700
+ " 'seek': 5800,\n",
701
+ " 'start': 68.0,\n",
702
+ " 'end': 70.0,\n",
703
+ " 'text': ' And I used to defend you.',\n",
704
+ " 'tokens': [843, 314, 973, 284, 4404, 345, 13],\n",
705
+ " 'temperature': 0.0,\n",
706
+ " 'avg_logprob': -0.12269237166956852,\n",
707
+ " 'compression_ratio': 1.6801346801346801,\n",
708
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
709
+ " {'id': 36,\n",
710
+ " 'seek': 5800,\n",
711
+ " 'start': 70.0,\n",
712
+ " 'end': 71.0,\n",
713
+ " 'text': ' They were absolutely right.',\n",
714
+ " 'tokens': [1119, 547, 5543, 826, 13],\n",
715
+ " 'temperature': 0.0,\n",
716
+ " 'avg_logprob': -0.12269237166956852,\n",
717
+ " 'compression_ratio': 1.6801346801346801,\n",
718
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
719
+ " {'id': 37,\n",
720
+ " 'seek': 5800,\n",
721
+ " 'start': 71.0,\n",
722
+ " 'end': 73.0,\n",
723
+ " 'text': ' All your best acting is behind you.',\n",
724
+ " 'tokens': [1439, 534, 1266, 7205, 318, 2157, 345, 13],\n",
725
+ " 'temperature': 0.0,\n",
726
+ " 'avg_logprob': -0.12269237166956852,\n",
727
+ " 'compression_ratio': 1.6801346801346801,\n",
728
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
729
+ " {'id': 38,\n",
730
+ " 'seek': 5800,\n",
731
+ " 'start': 73.0,\n",
732
+ " 'end': 74.0,\n",
733
+ " 'text': \" You're back to being a hack.\",\n",
734
+ " 'tokens': [921, 821, 736, 284, 852, 257, 8156, 13],\n",
735
+ " 'temperature': 0.0,\n",
736
+ " 'avg_logprob': -0.12269237166956852,\n",
737
+ " 'compression_ratio': 1.6801346801346801,\n",
738
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
739
+ " {'id': 39,\n",
740
+ " 'seek': 5800,\n",
741
+ " 'start': 74.0,\n",
742
+ " 'end': 76.0,\n",
743
+ " 'text': ' You got slighted me.',\n",
744
+ " 'tokens': [921, 1392, 3731, 276, 502, 13],\n",
745
+ " 'temperature': 0.0,\n",
746
+ " 'avg_logprob': -0.12269237166956852,\n",
747
+ " 'compression_ratio': 1.6801346801346801,\n",
748
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
749
+ " {'id': 40,\n",
750
+ " 'seek': 5800,\n",
751
+ " 'start': 76.0,\n",
752
+ " 'end': 77.0,\n",
753
+ " 'text': \" You're a fucking villain.\",\n",
754
+ " 'tokens': [921, 821, 257, 9372, 16687, 13],\n",
755
+ " 'temperature': 0.0,\n",
756
+ " 'avg_logprob': -0.12269237166956852,\n",
757
+ " 'compression_ratio': 1.6801346801346801,\n",
758
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
759
+ " {'id': 41,\n",
760
+ " 'seek': 5800,\n",
761
+ " 'start': 77.0,\n",
762
+ " 'end': 79.0,\n",
763
+ " 'text': ' And you want to present yourself as a victim',\n",
764
+ " 'tokens': [843, 345, 765, 284, 1944, 3511, 355, 257, 3117],\n",
765
+ " 'temperature': 0.0,\n",
766
+ " 'avg_logprob': -0.12269237166956852,\n",
767
+ " 'compression_ratio': 1.6801346801346801,\n",
768
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
769
+ " {'id': 42,\n",
770
+ " 'seek': 5800,\n",
771
+ " 'start': 79.0,\n",
772
+ " 'end': 81.0,\n",
773
+ " 'text': \" because it's a good legal strategy?\",\n",
774
+ " 'tokens': [780, 340, 338, 257, 922, 2742, 4811, 30],\n",
775
+ " 'temperature': 0.0,\n",
776
+ " 'avg_logprob': -0.12269237166956852,\n",
777
+ " 'compression_ratio': 1.6801346801346801,\n",
778
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
779
+ " {'id': 43,\n",
780
+ " 'seek': 5800,\n",
781
+ " 'start': 81.0,\n",
782
+ " 'end': 82.0,\n",
783
+ " 'text': ' Fine.',\n",
784
+ " 'tokens': [17867, 13],\n",
785
+ " 'temperature': 0.0,\n",
786
+ " 'avg_logprob': -0.12269237166956852,\n",
787
+ " 'compression_ratio': 1.6801346801346801,\n",
788
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
789
+ " {'id': 44,\n",
790
+ " 'seek': 5800,\n",
791
+ " 'start': 82.0,\n",
792
+ " 'end': 84.0,\n",
793
+ " 'text': ' But you and I both know you chose this life.',\n",
794
+ " 'tokens': [887, 345, 290, 314, 1111, 760, 345, 7690, 428, 1204, 13],\n",
795
+ " 'temperature': 0.0,\n",
796
+ " 'avg_logprob': -0.12269237166956852,\n",
797
+ " 'compression_ratio': 1.6801346801346801,\n",
798
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
799
+ " {'id': 45,\n",
800
+ " 'seek': 5800,\n",
801
+ " 'start': 84.0,\n",
802
+ " 'end': 87.0,\n",
803
+ " 'text': \" You wanted it until you didn't.\",\n",
804
+ " 'tokens': [921, 2227, 340, 1566, 345, 1422, 470, 13],\n",
805
+ " 'temperature': 0.0,\n",
806
+ " 'avg_logprob': -0.12269237166956852,\n",
807
+ " 'compression_ratio': 1.6801346801346801,\n",
808
+ " 'no_speech_prob': 8.356517469110258e-07},\n",
809
+ " {'id': 46,\n",
810
+ " 'seek': 8700,\n",
811
+ " 'start': 87.0,\n",
812
+ " 'end': 90.0,\n",
813
+ " 'text': ' You used me so you could get out of LA.',\n",
814
+ " 'tokens': [921, 973, 502, 523, 345, 714, 651, 503, 286, 9131, 13],\n",
815
+ " 'temperature': 0.0,\n",
816
+ " 'avg_logprob': -0.08677079749829841,\n",
817
+ " 'compression_ratio': 1.8721804511278195,\n",
818
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
819
+ " {'id': 47,\n",
820
+ " 'seek': 8700,\n",
821
+ " 'start': 90.0,\n",
822
+ " 'end': 91.0,\n",
823
+ " 'text': \" I didn't use you.\",\n",
824
+ " 'tokens': [314, 1422, 470, 779, 345, 13],\n",
825
+ " 'temperature': 0.0,\n",
826
+ " 'avg_logprob': -0.08677079749829841,\n",
827
+ " 'compression_ratio': 1.8721804511278195,\n",
828
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
829
+ " {'id': 48,\n",
830
+ " 'seek': 8700,\n",
831
+ " 'start': 91.0,\n",
832
+ " 'end': 92.0,\n",
833
+ " 'text': ' You did.',\n",
834
+ " 'tokens': [921, 750, 13],\n",
835
+ " 'temperature': 0.0,\n",
836
+ " 'avg_logprob': -0.08677079749829841,\n",
837
+ " 'compression_ratio': 1.8721804511278195,\n",
838
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
839
+ " {'id': 49,\n",
840
+ " 'seek': 8700,\n",
841
+ " 'start': 92.0,\n",
842
+ " 'end': 94.0,\n",
843
+ " 'text': ' And then you blamed me for it.',\n",
844
+ " 'tokens': [843, 788, 345, 13772, 502, 329, 340, 13],\n",
845
+ " 'temperature': 0.0,\n",
846
+ " 'avg_logprob': -0.08677079749829841,\n",
847
+ " 'compression_ratio': 1.8721804511278195,\n",
848
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
849
+ " {'id': 50,\n",
850
+ " 'seek': 8700,\n",
851
+ " 'start': 94.0,\n",
852
+ " 'end': 96.0,\n",
853
+ " 'text': ' You always made me aware of what I was doing wrong.',\n",
854
+ " 'tokens': [921, 1464, 925, 502, 3910, 286, 644, 314, 373, 1804, 2642, 13],\n",
855
+ " 'temperature': 0.0,\n",
856
+ " 'avg_logprob': -0.08677079749829841,\n",
857
+ " 'compression_ratio': 1.8721804511278195,\n",
858
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
859
+ " {'id': 51,\n",
860
+ " 'seek': 8700,\n",
861
+ " 'start': 96.0,\n",
862
+ " 'end': 98.0,\n",
863
+ " 'text': ' How I was falling short.',\n",
864
+ " 'tokens': [1374, 314, 373, 7463, 1790, 13],\n",
865
+ " 'temperature': 0.0,\n",
866
+ " 'avg_logprob': -0.08677079749829841,\n",
867
+ " 'compression_ratio': 1.8721804511278195,\n",
868
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
869
+ " {'id': 52,\n",
870
+ " 'seek': 8700,\n",
871
+ " 'start': 98.0,\n",
872
+ " 'end': 99.0,\n",
873
+ " 'text': ' Life with you was joyless.',\n",
874
+ " 'tokens': [5155, 351, 345, 373, 8716, 1203, 13],\n",
875
+ " 'temperature': 0.0,\n",
876
+ " 'avg_logprob': -0.08677079749829841,\n",
877
+ " 'compression_ratio': 1.8721804511278195,\n",
878
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
879
+ " {'id': 53,\n",
880
+ " 'seek': 8700,\n",
881
+ " 'start': 99.0,\n",
882
+ " 'end': 101.0,\n",
883
+ " 'text': ' Then you had to go and fuck someone.',\n",
884
+ " 'tokens': [3244, 345, 550, 284, 467, 290, 5089, 2130, 13],\n",
885
+ " 'temperature': 0.0,\n",
886
+ " 'avg_logprob': -0.08677079749829841,\n",
887
+ " 'compression_ratio': 1.8721804511278195,\n",
888
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
889
+ " {'id': 54,\n",
890
+ " 'seek': 8700,\n",
891
+ " 'start': 101.0,\n",
892
+ " 'end': 103.0,\n",
893
+ " 'text': ' You should be upset that I fucked her.',\n",
894
+ " 'tokens': [921, 815, 307, 9247, 326, 314, 20654, 607, 13],\n",
895
+ " 'temperature': 0.0,\n",
896
+ " 'avg_logprob': -0.08677079749829841,\n",
897
+ " 'compression_ratio': 1.8721804511278195,\n",
898
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
899
+ " {'id': 55,\n",
900
+ " 'seek': 8700,\n",
901
+ " 'start': 103.0,\n",
902
+ " 'end': 106.0,\n",
903
+ " 'text': ' You should be upset that I had a laugh with her.',\n",
904
+ " 'tokens': [921, 815, 307, 9247, 326, 314, 550, 257, 6487, 351, 607, 13],\n",
905
+ " 'temperature': 0.0,\n",
906
+ " 'avg_logprob': -0.08677079749829841,\n",
907
+ " 'compression_ratio': 1.8721804511278195,\n",
908
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
909
+ " {'id': 56,\n",
910
+ " 'seek': 8700,\n",
911
+ " 'start': 106.0,\n",
912
+ " 'end': 107.0,\n",
913
+ " 'text': ' Do you love her?',\n",
914
+ " 'tokens': [2141, 345, 1842, 607, 30],\n",
915
+ " 'temperature': 0.0,\n",
916
+ " 'avg_logprob': -0.08677079749829841,\n",
917
+ " 'compression_ratio': 1.8721804511278195,\n",
918
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
919
+ " {'id': 57,\n",
920
+ " 'seek': 8700,\n",
921
+ " 'start': 107.0,\n",
922
+ " 'end': 108.0,\n",
923
+ " 'text': ' No.',\n",
924
+ " 'tokens': [1400, 13],\n",
925
+ " 'temperature': 0.0,\n",
926
+ " 'avg_logprob': -0.08677079749829841,\n",
927
+ " 'compression_ratio': 1.8721804511278195,\n",
928
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
929
+ " {'id': 58,\n",
930
+ " 'seek': 8700,\n",
931
+ " 'start': 108.0,\n",
932
+ " 'end': 109.0,\n",
933
+ " 'text': \" But she didn't hate me.\",\n",
934
+ " 'tokens': [887, 673, 1422, 470, 5465, 502, 13],\n",
935
+ " 'temperature': 0.0,\n",
936
+ " 'avg_logprob': -0.08677079749829841,\n",
937
+ " 'compression_ratio': 1.8721804511278195,\n",
938
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
939
+ " {'id': 59,\n",
940
+ " 'seek': 8700,\n",
941
+ " 'start': 109.0,\n",
942
+ " 'end': 110.0,\n",
943
+ " 'text': ' You hated me.',\n",
944
+ " 'tokens': [921, 16563, 502, 13],\n",
945
+ " 'temperature': 0.0,\n",
946
+ " 'avg_logprob': -0.08677079749829841,\n",
947
+ " 'compression_ratio': 1.8721804511278195,\n",
948
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
949
+ " {'id': 60,\n",
950
+ " 'seek': 8700,\n",
951
+ " 'start': 110.0,\n",
952
+ " 'end': 111.0,\n",
953
+ " 'text': ' You hated me.',\n",
954
+ " 'tokens': [921, 16563, 502, 13],\n",
955
+ " 'temperature': 0.0,\n",
956
+ " 'avg_logprob': -0.08677079749829841,\n",
957
+ " 'compression_ratio': 1.8721804511278195,\n",
958
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
959
+ " {'id': 61,\n",
960
+ " 'seek': 8700,\n",
961
+ " 'start': 111.0,\n",
962
+ " 'end': 112.0,\n",
963
+ " 'text': ' You hated me.',\n",
964
+ " 'tokens': [921, 16563, 502, 13],\n",
965
+ " 'temperature': 0.0,\n",
966
+ " 'avg_logprob': -0.08677079749829841,\n",
967
+ " 'compression_ratio': 1.8721804511278195,\n",
968
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
969
+ " {'id': 62,\n",
970
+ " 'seek': 8700,\n",
971
+ " 'start': 112.0,\n",
972
+ " 'end': 113.0,\n",
973
+ " 'text': ' You fucked somebody we worked with.',\n",
974
+ " 'tokens': [921, 20654, 8276, 356, 3111, 351, 13],\n",
975
+ " 'temperature': 0.0,\n",
976
+ " 'avg_logprob': -0.08677079749829841,\n",
977
+ " 'compression_ratio': 1.8721804511278195,\n",
978
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
979
+ " {'id': 63,\n",
980
+ " 'seek': 8700,\n",
981
+ " 'start': 113.0,\n",
982
+ " 'end': 116.0,\n",
983
+ " 'text': ' You stopped having sex with me in the last year.',\n",
984
+ " 'tokens': [921, 5025, 1719, 1714, 351, 502, 287, 262, 938, 614, 13],\n",
985
+ " 'temperature': 0.0,\n",
986
+ " 'avg_logprob': -0.08677079749829841,\n",
987
+ " 'compression_ratio': 1.8721804511278195,\n",
988
+ " 'no_speech_prob': 9.081037433134043e-07},\n",
989
+ " {'id': 64,\n",
990
+ " 'seek': 11600,\n",
991
+ " 'start': 116.0,\n",
992
+ " 'end': 117.0,\n",
993
+ " 'text': ' I never cheated on you.',\n",
994
+ " 'tokens': [314, 1239, 37264, 319, 345, 13],\n",
995
+ " 'temperature': 0.0,\n",
996
+ " 'avg_logprob': -0.11258821849581561,\n",
997
+ " 'compression_ratio': 1.8339622641509434,\n",
998
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
999
+ " {'id': 65,\n",
1000
+ " 'seek': 11600,\n",
1001
+ " 'start': 117.0,\n",
1002
+ " 'end': 119.0,\n",
1003
+ " 'text': ' What was cheating on me?',\n",
1004
+ " 'tokens': [1867, 373, 21608, 319, 502, 30],\n",
1005
+ " 'temperature': 0.0,\n",
1006
+ " 'avg_logprob': -0.11258821849581561,\n",
1007
+ " 'compression_ratio': 1.8339622641509434,\n",
1008
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1009
+ " {'id': 66,\n",
1010
+ " 'seek': 11600,\n",
1011
+ " 'start': 119.0,\n",
1012
+ " 'end': 121.0,\n",
1013
+ " 'text': \" But there's so much I could have done.\",\n",
1014
+ " 'tokens': [887, 612, 338, 523, 881, 314, 714, 423, 1760, 13],\n",
1015
+ " 'temperature': 0.0,\n",
1016
+ " 'avg_logprob': -0.11258821849581561,\n",
1017
+ " 'compression_ratio': 1.8339622641509434,\n",
1018
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1019
+ " {'id': 67,\n",
1020
+ " 'seek': 11600,\n",
1021
+ " 'start': 121.0,\n",
1022
+ " 'end': 123.0,\n",
1023
+ " 'text': ' I was a director in my 20s who came from nothing',\n",
1024
+ " 'tokens': [314, 373, 257, 3437, 287, 616, 1160, 82, 508, 1625, 422, 2147],\n",
1025
+ " 'temperature': 0.0,\n",
1026
+ " 'avg_logprob': -0.11258821849581561,\n",
1027
+ " 'compression_ratio': 1.8339622641509434,\n",
1028
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1029
+ " {'id': 68,\n",
1030
+ " 'seek': 11600,\n",
1031
+ " 'start': 123.0,\n",
1032
+ " 'end': 126.0,\n",
1033
+ " 'text': ' and was suddenly on the cover of fucking Time Out New York.',\n",
1034
+ " 'tokens': [290,\n",
1035
+ " 373,\n",
1036
+ " 6451,\n",
1037
+ " 319,\n",
1038
+ " 262,\n",
1039
+ " 3002,\n",
1040
+ " 286,\n",
1041
+ " 9372,\n",
1042
+ " 3862,\n",
1043
+ " 3806,\n",
1044
+ " 968,\n",
1045
+ " 1971,\n",
1046
+ " 13],\n",
1047
+ " 'temperature': 0.0,\n",
1048
+ " 'avg_logprob': -0.11258821849581561,\n",
1049
+ " 'compression_ratio': 1.8339622641509434,\n",
1050
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1051
+ " {'id': 69,\n",
1052
+ " 'seek': 11600,\n",
1053
+ " 'start': 126.0,\n",
1054
+ " 'end': 129.0,\n",
1055
+ " 'text': \" I was hot shit and I wanted to fuck everybody and I didn't.\",\n",
1056
+ " 'tokens': [314,\n",
1057
+ " 373,\n",
1058
+ " 3024,\n",
1059
+ " 7510,\n",
1060
+ " 290,\n",
1061
+ " 314,\n",
1062
+ " 2227,\n",
1063
+ " 284,\n",
1064
+ " 5089,\n",
1065
+ " 7288,\n",
1066
+ " 290,\n",
1067
+ " 314,\n",
1068
+ " 1422,\n",
1069
+ " 470,\n",
1070
+ " 13],\n",
1071
+ " 'temperature': 0.0,\n",
1072
+ " 'avg_logprob': -0.11258821849581561,\n",
1073
+ " 'compression_ratio': 1.8339622641509434,\n",
1074
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1075
+ " {'id': 70,\n",
1076
+ " 'seek': 11600,\n",
1077
+ " 'start': 129.0,\n",
1078
+ " 'end': 131.0,\n",
1079
+ " 'text': \" And I loved you and I didn't want to lose you.\",\n",
1080
+ " 'tokens': [843,\n",
1081
+ " 314,\n",
1082
+ " 6151,\n",
1083
+ " 345,\n",
1084
+ " 290,\n",
1085
+ " 314,\n",
1086
+ " 1422,\n",
1087
+ " 470,\n",
1088
+ " 765,\n",
1089
+ " 284,\n",
1090
+ " 4425,\n",
1091
+ " 345,\n",
1092
+ " 13],\n",
1093
+ " 'temperature': 0.0,\n",
1094
+ " 'avg_logprob': -0.11258821849581561,\n",
1095
+ " 'compression_ratio': 1.8339622641509434,\n",
1096
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1097
+ " {'id': 71,\n",
1098
+ " 'seek': 11600,\n",
1099
+ " 'start': 131.0,\n",
1100
+ " 'end': 133.0,\n",
1101
+ " 'text': ' But I had made my 20s.',\n",
1102
+ " 'tokens': [887, 314, 550, 925, 616, 1160, 82, 13],\n",
1103
+ " 'temperature': 0.0,\n",
1104
+ " 'avg_logprob': -0.11258821849581561,\n",
1105
+ " 'compression_ratio': 1.8339622641509434,\n",
1106
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1107
+ " {'id': 72,\n",
1108
+ " 'seek': 11600,\n",
1109
+ " 'start': 133.0,\n",
1110
+ " 'end': 135.0,\n",
1111
+ " 'text': \" And I didn't want to lose that too.\",\n",
1112
+ " 'tokens': [843, 314, 1422, 470, 765, 284, 4425, 326, 1165, 13],\n",
1113
+ " 'temperature': 0.0,\n",
1114
+ " 'avg_logprob': -0.11258821849581561,\n",
1115
+ " 'compression_ratio': 1.8339622641509434,\n",
1116
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1117
+ " {'id': 73,\n",
1118
+ " 'seek': 11600,\n",
1119
+ " 'start': 135.0,\n",
1120
+ " 'end': 136.0,\n",
1121
+ " 'text': ' And I kind of did.',\n",
1122
+ " 'tokens': [843, 314, 1611, 286, 750, 13],\n",
1123
+ " 'temperature': 0.0,\n",
1124
+ " 'avg_logprob': -0.11258821849581561,\n",
1125
+ " 'compression_ratio': 1.8339622641509434,\n",
1126
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1127
+ " {'id': 74,\n",
1128
+ " 'seek': 11600,\n",
1129
+ " 'start': 136.0,\n",
1130
+ " 'end': 139.0,\n",
1131
+ " 'text': ' And you wanted so much so fast.',\n",
1132
+ " 'tokens': [843, 345, 2227, 523, 881, 523, 3049, 13],\n",
1133
+ " 'temperature': 0.0,\n",
1134
+ " 'avg_logprob': -0.11258821849581561,\n",
1135
+ " 'compression_ratio': 1.8339622641509434,\n",
1136
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1137
+ " {'id': 75,\n",
1138
+ " 'seek': 11600,\n",
1139
+ " 'start': 139.0,\n",
1140
+ " 'end': 141.0,\n",
1141
+ " 'text': \" I didn't even want to get married.\",\n",
1142
+ " 'tokens': [314, 1422, 470, 772, 765, 284, 651, 6405, 13],\n",
1143
+ " 'temperature': 0.0,\n",
1144
+ " 'avg_logprob': -0.11258821849581561,\n",
1145
+ " 'compression_ratio': 1.8339622641509434,\n",
1146
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1147
+ " {'id': 76,\n",
1148
+ " 'seek': 11600,\n",
1149
+ " 'start': 141.0,\n",
1150
+ " 'end': 142.0,\n",
1151
+ " 'text': ' Fuck it.',\n",
1152
+ " 'tokens': [25617, 340, 13],\n",
1153
+ " 'temperature': 0.0,\n",
1154
+ " 'avg_logprob': -0.11258821849581561,\n",
1155
+ " 'compression_ratio': 1.8339622641509434,\n",
1156
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1157
+ " {'id': 77,\n",
1158
+ " 'seek': 11600,\n",
1159
+ " 'start': 142.0,\n",
1160
+ " 'end': 145.0,\n",
1161
+ " 'text': \" There's so much I didn't do.\",\n",
1162
+ " 'tokens': [1318, 338, 523, 881, 314, 1422, 470, 466, 13],\n",
1163
+ " 'temperature': 0.0,\n",
1164
+ " 'avg_logprob': -0.11258821849581561,\n",
1165
+ " 'compression_ratio': 1.8339622641509434,\n",
1166
+ " 'no_speech_prob': 2.3229101486776926e-07},\n",
1167
+ " {'id': 78,\n",
1168
+ " 'seek': 14500,\n",
1169
+ " 'start': 145.0,\n",
1170
+ " 'end': 147.0,\n",
1171
+ " 'text': ' Thanks for that.',\n",
1172
+ " 'tokens': [6930, 329, 326, 13],\n",
1173
+ " 'temperature': 0.0,\n",
1174
+ " 'avg_logprob': -0.15252724697715359,\n",
1175
+ " 'compression_ratio': 1.5245901639344261,\n",
1176
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1177
+ " {'id': 79,\n",
1178
+ " 'seek': 14500,\n",
1179
+ " 'start': 147.0,\n",
1180
+ " 'end': 148.0,\n",
1181
+ " 'text': \" You're welcome.\",\n",
1182
+ " 'tokens': [921, 821, 7062, 13],\n",
1183
+ " 'temperature': 0.0,\n",
1184
+ " 'avg_logprob': -0.15252724697715359,\n",
1185
+ " 'compression_ratio': 1.5245901639344261,\n",
1186
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1187
+ " {'id': 80,\n",
1188
+ " 'seek': 14500,\n",
1189
+ " 'start': 148.0,\n",
1190
+ " 'end': 151.0,\n",
1191
+ " 'text': \" I can't believe I didn't know you forever.\",\n",
1192
+ " 'tokens': [314, 460, 470, 1975, 314, 1422, 470, 760, 345, 8097, 13],\n",
1193
+ " 'temperature': 0.0,\n",
1194
+ " 'avg_logprob': -0.15252724697715359,\n",
1195
+ " 'compression_ratio': 1.5245901639344261,\n",
1196
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1197
+ " {'id': 81,\n",
1198
+ " 'seek': 14500,\n",
1199
+ " 'start': 151.0,\n",
1200
+ " 'end': 154.0,\n",
1201
+ " 'text': \" You're fucking insane.\",\n",
1202
+ " 'tokens': [921, 821, 9372, 13251, 13],\n",
1203
+ " 'temperature': 0.0,\n",
1204
+ " 'avg_logprob': -0.15252724697715359,\n",
1205
+ " 'compression_ratio': 1.5245901639344261,\n",
1206
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1207
+ " {'id': 82,\n",
1208
+ " 'seek': 14500,\n",
1209
+ " 'start': 154.0,\n",
1210
+ " 'end': 157.0,\n",
1211
+ " 'text': \" And you're fucking winning.\",\n",
1212
+ " 'tokens': [843, 345, 821, 9372, 5442, 13],\n",
1213
+ " 'temperature': 0.0,\n",
1214
+ " 'avg_logprob': -0.15252724697715359,\n",
1215
+ " 'compression_ratio': 1.5245901639344261,\n",
1216
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1217
+ " {'id': 83,\n",
1218
+ " 'seek': 14500,\n",
1219
+ " 'start': 157.0,\n",
1220
+ " 'end': 159.0,\n",
1221
+ " 'text': ' Are you kidding me?',\n",
1222
+ " 'tokens': [4231, 345, 26471, 502, 30],\n",
1223
+ " 'temperature': 0.0,\n",
1224
+ " 'avg_logprob': -0.15252724697715359,\n",
1225
+ " 'compression_ratio': 1.5245901639344261,\n",
1226
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1227
+ " {'id': 84,\n",
1228
+ " 'seek': 14500,\n",
1229
+ " 'start': 159.0,\n",
1230
+ " 'end': 161.0,\n",
1231
+ " 'text': \" I'm wanting to be married.\",\n",
1232
+ " 'tokens': [314, 1101, 10291, 284, 307, 6405, 13],\n",
1233
+ " 'temperature': 0.0,\n",
1234
+ " 'avg_logprob': -0.15252724697715359,\n",
1235
+ " 'compression_ratio': 1.5245901639344261,\n",
1236
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1237
+ " {'id': 85,\n",
1238
+ " 'seek': 14500,\n",
1239
+ " 'start': 161.0,\n",
1240
+ " 'end': 163.0,\n",
1241
+ " 'text': \" I don't already lost.\",\n",
1242
+ " 'tokens': [314, 836, 470, 1541, 2626, 13],\n",
1243
+ " 'temperature': 0.0,\n",
1244
+ " 'avg_logprob': -0.15252724697715359,\n",
1245
+ " 'compression_ratio': 1.5245901639344261,\n",
1246
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1247
+ " {'id': 86,\n",
1248
+ " 'seek': 14500,\n",
1249
+ " 'start': 163.0,\n",
1250
+ " 'end': 167.0,\n",
1251
+ " 'text': \" You didn't love me as much as I loved you.\",\n",
1252
+ " 'tokens': [921, 1422, 470, 1842, 502, 355, 881, 355, 314, 6151, 345, 13],\n",
1253
+ " 'temperature': 0.0,\n",
1254
+ " 'avg_logprob': -0.15252724697715359,\n",
1255
+ " 'compression_ratio': 1.5245901639344261,\n",
1256
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1257
+ " {'id': 87,\n",
1258
+ " 'seek': 14500,\n",
1259
+ " 'start': 167.0,\n",
1260
+ " 'end': 171.0,\n",
1261
+ " 'text': ' What does that have to do with LA?',\n",
1262
+ " 'tokens': [1867, 857, 326, 423, 284, 466, 351, 9131, 30],\n",
1263
+ " 'temperature': 0.0,\n",
1264
+ " 'avg_logprob': -0.15252724697715359,\n",
1265
+ " 'compression_ratio': 1.5245901639344261,\n",
1266
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1267
+ " {'id': 88,\n",
1268
+ " 'seek': 14500,\n",
1269
+ " 'start': 171.0,\n",
1270
+ " 'end': 172.0,\n",
1271
+ " 'text': ' What?',\n",
1272
+ " 'tokens': [1867, 30],\n",
1273
+ " 'temperature': 0.0,\n",
1274
+ " 'avg_logprob': -0.15252724697715359,\n",
1275
+ " 'compression_ratio': 1.5245901639344261,\n",
1276
+ " 'no_speech_prob': 1.9235710624343483e-07},\n",
1277
+ " {'id': 89,\n",
1278
+ " 'seek': 17200,\n",
1279
+ " 'start': 172.0,\n",
1280
+ " 'end': 175.0,\n",
1281
+ " 'text': \" You're so merged with your own selfishness.\",\n",
1282
+ " 'tokens': [921, 821, 523, 23791, 351, 534, 898, 20363, 1108, 13],\n",
1283
+ " 'temperature': 0.0,\n",
1284
+ " 'avg_logprob': -0.2508346222259186,\n",
1285
+ " 'compression_ratio': 1.4583333333333333,\n",
1286
+ " 'no_speech_prob': 3.485897934751847e-07},\n",
1287
+ " {'id': 90,\n",
1288
+ " 'seek': 17200,\n",
1289
+ " 'start': 175.0,\n",
1290
+ " 'end': 178.0,\n",
1291
+ " 'text': \" You don't need to identify it and selfishness anymore.\",\n",
1292
+ " 'tokens': [921, 836, 470, 761, 284, 5911, 340, 290, 20363, 1108, 7471, 13],\n",
1293
+ " 'temperature': 0.0,\n",
1294
+ " 'avg_logprob': -0.2508346222259186,\n",
1295
+ " 'compression_ratio': 1.4583333333333333,\n",
1296
+ " 'no_speech_prob': 3.485897934751847e-07},\n",
1297
+ " {'id': 91,\n",
1298
+ " 'seek': 17200,\n",
1299
+ " 'start': 178.0,\n",
1300
+ " 'end': 181.0,\n",
1301
+ " 'text': \" You're such a dick.\",\n",
1302
+ " 'tokens': [921, 821, 884, 257, 19317, 13],\n",
1303
+ " 'temperature': 0.0,\n",
1304
+ " 'avg_logprob': -0.2508346222259186,\n",
1305
+ " 'compression_ratio': 1.4583333333333333,\n",
1306
+ " 'no_speech_prob': 3.485897934751847e-07},\n",
1307
+ " {'id': 92,\n",
1308
+ " 'seek': 17200,\n",
1309
+ " 'start': 181.0,\n",
1310
+ " 'end': 184.0,\n",
1311
+ " 'text': \" Every day I wake up and I hope you're dead.\",\n",
1312
+ " 'tokens': [3887, 1110, 314, 7765, 510, 290, 314, 2911, 345, 821, 2636, 13],\n",
1313
+ " 'temperature': 0.0,\n",
1314
+ " 'avg_logprob': -0.2508346222259186,\n",
1315
+ " 'compression_ratio': 1.4583333333333333,\n",
1316
+ " 'no_speech_prob': 3.485897934751847e-07},\n",
1317
+ " {'id': 93,\n",
1318
+ " 'seek': 17200,\n",
1319
+ " 'start': 184.0,\n",
1320
+ " 'end': 185.0,\n",
1321
+ " 'text': ' Dead like it.',\n",
1322
+ " 'tokens': [5542, 588, 340, 13],\n",
1323
+ " 'temperature': 0.0,\n",
1324
+ " 'avg_logprob': -0.2508346222259186,\n",
1325
+ " 'compression_ratio': 1.4583333333333333,\n",
1326
+ " 'no_speech_prob': 3.485897934751847e-07},\n",
1327
+ " {'id': 94,\n",
1328
+ " 'seek': 17200,\n",
1329
+ " 'start': 185.0,\n",
1330
+ " 'end': 187.0,\n",
1331
+ " 'text': ' If I can guarantee every movie, okay?',\n",
1332
+ " 'tokens': [1002, 314, 460, 9149, 790, 3807, 11, 8788, 30],\n",
1333
+ " 'temperature': 0.0,\n",
1334
+ " 'avg_logprob': -0.2508346222259186,\n",
1335
+ " 'compression_ratio': 1.4583333333333333,\n",
1336
+ " 'no_speech_prob': 3.485897934751847e-07},\n",
1337
+ " {'id': 95,\n",
1338
+ " 'seek': 17200,\n",
1339
+ " 'start': 187.0,\n",
1340
+ " 'end': 189.0,\n",
1341
+ " 'text': \" I don't think I'm gonna kill this.\",\n",
1342
+ " 'tokens': [314, 836, 470, 892, 314, 1101, 8066, 1494, 428, 13],\n",
1343
+ " 'temperature': 0.0,\n",
1344
+ " 'avg_logprob': -0.2508346222259186,\n",
1345
+ " 'compression_ratio': 1.4583333333333333,\n",
1346
+ " 'no_speech_prob': 3.485897934751847e-07},\n",
1347
+ " {'id': 96,\n",
1348
+ " 'seek': 17200,\n",
1349
+ " 'start': 189.0,\n",
1350
+ " 'end': 191.0,\n",
1351
+ " 'text': ' As I can hit by a car and die.',\n",
1352
+ " 'tokens': [1081, 314, 460, 2277, 416, 257, 1097, 290, 4656, 13],\n",
1353
+ " 'temperature': 0.0,\n",
1354
+ " 'avg_logprob': -0.2508346222259186,\n",
1355
+ " 'compression_ratio': 1.4583333333333333,\n",
1356
+ " 'no_speech_prob': 3.485897934751847e-07},\n",
1357
+ " {'id': 97,\n",
1358
+ " 'seek': 19100,\n",
1359
+ " 'start': 191.0,\n",
1360
+ " 'end': 196.0,\n",
1361
+ " 'text': \" You don't\",\n",
1362
+ " 'tokens': [921, 836, 470],\n",
1363
+ " 'temperature': 1.0,\n",
1364
+ " 'avg_logprob': -2.0730719472847734,\n",
1365
+ " 'compression_ratio': 1.3488372093023255,\n",
1366
+ " 'no_speech_prob': 1.8071212252834812e-05},\n",
1367
+ " {'id': 98,\n",
1368
+ " 'seek': 19100,\n",
1369
+ " 'start': 196.0,\n",
1370
+ " 'end': 201.0,\n",
1371
+ " 'text': \" work with me once you're left alone.\",\n",
1372
+ " 'tokens': [670, 351, 502, 1752, 345, 821, 1364, 3436, 13],\n",
1373
+ " 'temperature': 1.0,\n",
1374
+ " 'avg_logprob': -2.0730719472847734,\n",
1375
+ " 'compression_ratio': 1.3488372093023255,\n",
1376
+ " 'no_speech_prob': 1.8071212252834812e-05},\n",
1377
+ " {'id': 99,\n",
1378
+ " 'seek': 19100,\n",
1379
+ " 'start': 201.0,\n",
1380
+ " 'end': 205.0,\n",
1381
+ " 'text': ' I lost you and you 2, 3 4',\n",
1382
+ " 'tokens': [314, 2626, 345, 290, 345, 362, 11, 513, 604],\n",
1383
+ " 'temperature': 1.0,\n",
1384
+ " 'avg_logprob': -2.0730719472847734,\n",
1385
+ " 'compression_ratio': 1.3488372093023255,\n",
1386
+ " 'no_speech_prob': 1.8071212252834812e-05},\n",
1387
+ " {'id': 100,\n",
1388
+ " 'seek': 19100,\n",
1389
+ " 'start': 205.0,\n",
1390
+ " 'end': 208.0,\n",
1391
+ " 'text': ' I lost you.',\n",
1392
+ " 'tokens': [314, 2626, 345, 13],\n",
1393
+ " 'temperature': 1.0,\n",
1394
+ " 'avg_logprob': -2.0730719472847734,\n",
1395
+ " 'compression_ratio': 1.3488372093023255,\n",
1396
+ " 'no_speech_prob': 1.8071212252834812e-05},\n",
1397
+ " {'id': 101,\n",
1398
+ " 'seek': 19100,\n",
1399
+ " 'start': 208.0,\n",
1400
+ " 'end': 211.0,\n",
1401
+ " 'text': ' I lost you, 2 4',\n",
1402
+ " 'tokens': [314, 2626, 345, 11, 362, 604],\n",
1403
+ " 'temperature': 1.0,\n",
1404
+ " 'avg_logprob': -2.0730719472847734,\n",
1405
+ " 'compression_ratio': 1.3488372093023255,\n",
1406
+ " 'no_speech_prob': 1.8071212252834812e-05},\n",
1407
+ " {'id': 102,\n",
1408
+ " 'seek': 19100,\n",
1409
+ " 'start': 211.0,\n",
1410
+ " 'end': 214.0,\n",
1411
+ " 'text': ' I lost you, 2 5',\n",
1412
+ " 'tokens': [314, 2626, 345, 11, 362, 642],\n",
1413
+ " 'temperature': 1.0,\n",
1414
+ " 'avg_logprob': -2.0730719472847734,\n",
1415
+ " 'compression_ratio': 1.3488372093023255,\n",
1416
+ " 'no_speech_prob': 1.8071212252834812e-05},\n",
1417
+ " {'id': 103,\n",
1418
+ " 'seek': 21400,\n",
1419
+ " 'start': 214.0,\n",
1420
+ " 'end': 218.88,\n",
1421
+ " 'text': \" I'm sorry.\",\n",
1422
+ " 'tokens': [314, 1101, 7926, 13],\n",
1423
+ " 'temperature': 1.0,\n",
1424
+ " 'avg_logprob': -2.222068927906178,\n",
1425
+ " 'compression_ratio': 1.0,\n",
1426
+ " 'no_speech_prob': 0.04370500147342682},\n",
1427
+ " {'id': 104,\n",
1428
+ " 'seek': 21400,\n",
1429
+ " 'start': 218.88,\n",
1430
+ " 'end': 220.54,\n",
1431
+ " 'text': ' Sheila.',\n",
1432
+ " 'tokens': [49627, 13],\n",
1433
+ " 'temperature': 1.0,\n",
1434
+ " 'avg_logprob': -2.222068927906178,\n",
1435
+ " 'compression_ratio': 1.0,\n",
1436
+ " 'no_speech_prob': 0.04370500147342682},\n",
1437
+ " {'id': 105,\n",
1438
+ " 'seek': 21400,\n",
1439
+ " 'start': 220.54,\n",
1440
+ " 'end': 224.66,\n",
1441
+ " 'text': \" I'm sorry.\",\n",
1442
+ " 'tokens': [314, 1101, 7926, 13],\n",
1443
+ " 'temperature': 1.0,\n",
1444
+ " 'avg_logprob': -2.222068927906178,\n",
1445
+ " 'compression_ratio': 1.0,\n",
1446
+ " 'no_speech_prob': 0.04370500147342682},\n",
1447
+ " {'id': 106,\n",
1448
+ " 'seek': 21400,\n",
1449
+ " 'start': 224.66,\n",
1450
+ " 'end': 234.78,\n",
1451
+ " 'text': ' Did you know?',\n",
1452
+ " 'tokens': [7731, 345, 760, 30],\n",
1453
+ " 'temperature': 1.0,\n",
1454
+ " 'avg_logprob': -2.222068927906178,\n",
1455
+ " 'compression_ratio': 1.0,\n",
1456
+ " 'no_speech_prob': 0.04370500147342682},\n",
1457
+ " {'id': 107,\n",
1458
+ " 'seek': 23478,\n",
1459
+ " 'start': 234.78,\n",
1460
+ " 'end': 236.78,\n",
1461
+ " 'text': ' You',\n",
1462
+ " 'tokens': [50363, 921, 50463],\n",
1463
+ " 'temperature': 0.0,\n",
1464
+ " 'avg_logprob': -0.9333540797233582,\n",
1465
+ " 'compression_ratio': 0.2727272727272727,\n",
1466
+ " 'no_speech_prob': 0.5948350429534912}]"
1467
+ ]
1468
+ },
1469
+ "execution_count": 17,
1470
+ "metadata": {},
1471
+ "output_type": "execute_result"
1472
+ }
1473
+ ],
1474
+ "source": [
1475
+ "segments"
1476
+ ]
1477
+ },
1478
+ {
1479
+ "cell_type": "markdown",
1480
+ "id": "23f31008",
1481
+ "metadata": {},
1482
+ "source": [
1483
+ "## Know duration"
1484
+ ]
1485
+ },
1486
+ {
1487
+ "cell_type": "code",
1488
+ "execution_count": 18,
1489
+ "id": "5f0fe450",
1490
+ "metadata": {},
1491
+ "outputs": [],
1492
+ "source": [
1493
+ "import wave\n",
1494
+ "import contextlib"
1495
+ ]
1496
+ },
1497
+ {
1498
+ "cell_type": "code",
1499
+ "execution_count": 19,
1500
+ "id": "1015519a",
1501
+ "metadata": {},
1502
+ "outputs": [
1503
+ {
1504
+ "name": "stdout",
1505
+ "output_type": "stream",
1506
+ "text": [
1507
+ "conversion to wav ready, duration of audio file: 258.93\n"
1508
+ ]
1509
+ }
1510
+ ],
1511
+ "source": [
1512
+ "# Get duration\n",
1513
+ "with contextlib.closing(wave.open(audio_file,'r')) as f:\n",
1514
+ " frames = f.getnframes()\n",
1515
+ " rate = f.getframerate()\n",
1516
+ " duration = frames / float(rate)\n",
1517
+ "print(f\"conversion to wav ready, duration of audio file: {duration}\")"
1518
+ ]
1519
+ },
1520
+ {
1521
+ "cell_type": "markdown",
1522
+ "id": "d54e4a6f",
1523
+ "metadata": {},
1524
+ "source": [
1525
+ "## Speaker diarization"
1526
+ ]
1527
+ },
1528
+ {
1529
+ "cell_type": "code",
1530
+ "execution_count": 1,
1531
+ "id": "4f978ac0",
1532
+ "metadata": {},
1533
+ "outputs": [
1534
+ {
1535
+ "name": "stderr",
1536
+ "output_type": "stream",
1537
+ "text": [
1538
+ "/home/codespace/.local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
1539
+ " from .autonotebook import tqdm as notebook_tqdm\n"
1540
+ ]
1541
+ }
1542
+ ],
1543
+ "source": [
1544
+ "import pyannote.audio\n",
1545
+ "from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding\n",
1546
+ "from pyannote.audio import Audio\n",
1547
+ "from pyannote.core import Segment\n",
1548
+ "from tqdm.auto import tqdm\n",
1549
+ "import numpy as np\n",
1550
+ "import torch"
1551
+ ]
1552
+ },
1553
+ {
1554
+ "cell_type": "code",
1555
+ "execution_count": null,
1556
+ "id": "3f9a28ca",
1557
+ "metadata": {},
1558
+ "outputs": [
1559
+ {
1560
+ "ename": "OSError",
1561
+ "evalue": "[WinError 1314] Le client ne dispose pas d’un privilège nécessaire: 'C:\\\\Users\\\\theo.alvesdacosta\\\\.cache\\\\huggingface\\\\hub\\\\models--speechbrain--spkrec-ecapa-voxceleb\\\\snapshots\\\\5c0be3875fda05e81f3c004ed8c7c06be308de1e\\\\hyperparams.yaml' -> 'C:\\\\Users\\\\theo.alvesdacosta\\\\.cache\\\\torch\\\\pyannote\\\\speechbrain\\\\hyperparams.yaml'",
1562
+ "output_type": "error",
1563
+ "traceback": [
1564
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
1565
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
1566
+ "Cell \u001b[1;32mIn[30], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m embedding_model \u001b[38;5;241m=\u001b[39m \u001b[43mPretrainedSpeakerEmbedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mspeechbrain/spkrec-ecapa-voxceleb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcpu\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
1567
+ "File \u001b[1;32m~\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\bechdelai-lU12Pf_x-py3.8\\lib\\site-packages\\pyannote\\audio\\pipelines\\speaker_verification.py:463\u001b[0m, in \u001b[0;36mPretrainedSpeakerEmbedding\u001b[1;34m(embedding, device, use_auth_token)\u001b[0m\n\u001b[0;32m 431\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Pretrained speaker embedding\u001b[39;00m\n\u001b[0;32m 432\u001b[0m \n\u001b[0;32m 433\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 459\u001b[0m \u001b[38;5;124;03m>>> embeddings = get_embedding(waveforms, masks=masks)\u001b[39;00m\n\u001b[0;32m 460\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 462\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(embedding, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mspeechbrain\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m embedding:\n\u001b[1;32m--> 463\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSpeechBrainPretrainedSpeakerEmbedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 464\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\n\u001b[0;32m 465\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 467\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(embedding, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnvidia\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m embedding:\n\u001b[0;32m 468\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
NeMoPretrainedSpeakerEmbedding(embedding, device\u001b[38;5;241m=\u001b[39mdevice)\n",
1568
+ "File \u001b[1;32m~\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\bechdelai-lU12Pf_x-py3.8\\lib\\site-packages\\pyannote\\audio\\pipelines\\speaker_verification.py:242\u001b[0m, in \u001b[0;36mSpeechBrainPretrainedSpeakerEmbedding.__init__\u001b[1;34m(self, embedding, device, use_auth_token)\u001b[0m\n\u001b[0;32m 239\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding \u001b[38;5;241m=\u001b[39m embedding\n\u001b[0;32m 240\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice \u001b[38;5;241m=\u001b[39m device\n\u001b[1;32m--> 242\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclassifier_ \u001b[38;5;241m=\u001b[39m \u001b[43mSpeechBrain_EncoderClassifier\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_hparams\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 243\u001b[0m \u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[43msavedir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mCACHE_DIR\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m/speechbrain\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 245\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_opts\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdevice\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 246\u001b[0m \u001b[43m 
\u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 247\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
1569
+ "File \u001b[1;32m~\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\bechdelai-lU12Pf_x-py3.8\\lib\\site-packages\\speechbrain\\pretrained\\interfaces.py:342\u001b[0m, in \u001b[0;36mPretrained.from_hparams\u001b[1;34m(cls, source, hparams_file, pymodule_file, overrides, savedir, use_auth_token, revision, **kwargs)\u001b[0m\n\u001b[0;32m 340\u001b[0m clsname \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\n\u001b[0;32m 341\u001b[0m savedir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./pretrained_models/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mclsname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m-\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhashlib\u001b[38;5;241m.\u001b[39mmd5(source\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUTF-8\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;250m \u001b[39merrors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mreplace\u001b[39m\u001b[38;5;124m'\u001b[39m))\u001b[38;5;241m.\u001b[39mhexdigest()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 342\u001b[0m hparams_local_path \u001b[38;5;241m=\u001b[39m \u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 343\u001b[0m \u001b[43m \u001b[49m\u001b[43mhparams_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msavedir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\n\u001b[0;32m 344\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 345\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 346\u001b[0m pymodule_local_path \u001b[38;5;241m=\u001b[39m fetch(\n\u001b[0;32m 347\u001b[0m pymodule_file, source, savedir, use_auth_token, revision\n\u001b[0;32m 
348\u001b[0m )\n",
1570
+ "File \u001b[1;32m~\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\bechdelai-lU12Pf_x-py3.8\\lib\\site-packages\\speechbrain\\pretrained\\fetching.py:135\u001b[0m, in \u001b[0;36mfetch\u001b[1;34m(filename, source, savedir, overwrite, save_filename, use_auth_token, revision)\u001b[0m\n\u001b[0;32m 133\u001b[0m sourcepath \u001b[38;5;241m=\u001b[39m pathlib\u001b[38;5;241m.\u001b[39mPath(fetched_file)\u001b[38;5;241m.\u001b[39mabsolute()\n\u001b[0;32m 134\u001b[0m _missing_ok_unlink(destination)\n\u001b[1;32m--> 135\u001b[0m \u001b[43mdestination\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msymlink_to\u001b[49m\u001b[43m(\u001b[49m\u001b[43msourcepath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m destination\n",
1571
+ "File \u001b[1;32m~\\Anaconda3\\lib\\pathlib.py:1391\u001b[0m, in \u001b[0;36mPath.symlink_to\u001b[1;34m(self, target, target_is_directory)\u001b[0m\n\u001b[0;32m 1389\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_closed:\n\u001b[0;32m 1390\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_closed()\n\u001b[1;32m-> 1391\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_accessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msymlink\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_is_directory\u001b[49m\u001b[43m)\u001b[49m\n",
1572
+ "\u001b[1;31mOSError\u001b[0m: [WinError 1314] Le client ne dispose pas d’un privilège nécessaire: 'C:\\\\Users\\\\theo.alvesdacosta\\\\.cache\\\\huggingface\\\\hub\\\\models--speechbrain--spkrec-ecapa-voxceleb\\\\snapshots\\\\5c0be3875fda05e81f3c004ed8c7c06be308de1e\\\\hyperparams.yaml' -> 'C:\\\\Users\\\\theo.alvesdacosta\\\\.cache\\\\torch\\\\pyannote\\\\speechbrain\\\\hyperparams.yaml'"
1573
+ ]
1574
+ }
1575
+ ],
1576
+ "source": [
1577
+ "embedding_model = PretrainedSpeakerEmbedding( \n",
1578
+ " \"speechbrain/spkrec-ecapa-voxceleb\",\n",
1579
+ " device=\"cpu\")"
1580
+ ]
1581
+ },
1582
+ {
1583
+ "cell_type": "code",
1584
+ "execution_count": 25,
1585
+ "id": "42b9dc6a",
1586
+ "metadata": {},
1587
+ "outputs": [],
1588
+ "source": [
1589
+ "def segment_embedding(segment):\n",
1590
+ " audio = Audio()\n",
1591
+ " start = segment[\"start\"]\n",
1592
+ " # Whisper overshoots the end timestamp in the last segment\n",
1593
+ " end = min(duration, segment[\"end\"])\n",
1594
+ " clip = Segment(start, end)\n",
1595
+ " waveform, sample_rate = audio.crop(audio_file, clip)\n",
1596
+ " return embedding_model(waveform[None])"
1597
+ ]
1598
+ },
1599
+ {
1600
+ "cell_type": "code",
1601
+ "execution_count": 26,
1602
+ "id": "6acb732e",
1603
+ "metadata": {},
1604
+ "outputs": [
1605
+ {
1606
+ "data": {
1607
+ "application/vnd.jupyter.widget-view+json": {
1608
+ "model_id": "d0408c2dfc644f52832b1978b9967944",
1609
+ "version_major": 2,
1610
+ "version_minor": 0
1611
+ },
1612
+ "text/plain": [
1613
+ " 0%| | 0/101 [00:00<?, ?it/s]"
1614
+ ]
1615
+ },
1616
+ "metadata": {},
1617
+ "output_type": "display_data"
1618
+ },
1619
+ {
1620
+ "ename": "NameError",
1621
+ "evalue": "name 'embedding_model' is not defined",
1622
+ "output_type": "error",
1623
+ "traceback": [
1624
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
1625
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
1626
+ "Cell \u001b[1;32mIn[26], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros(shape\u001b[38;5;241m=\u001b[39m(\u001b[38;5;28mlen\u001b[39m(segments), \u001b[38;5;241m192\u001b[39m))\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, segment \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(tqdm(segments)):\n\u001b[1;32m----> 3\u001b[0m embeddings[i] \u001b[38;5;241m=\u001b[39m \u001b[43msegment_embedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43msegment\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mnan_to_num(embeddings)\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mEmbedding shape: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00membeddings\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n",
1627
+ "Cell \u001b[1;32mIn[25], line 8\u001b[0m, in \u001b[0;36msegment_embedding\u001b[1;34m(segment)\u001b[0m\n\u001b[0;32m 6\u001b[0m clip \u001b[38;5;241m=\u001b[39m Segment(start, end)\n\u001b[0;32m 7\u001b[0m waveform, sample_rate \u001b[38;5;241m=\u001b[39m audio\u001b[38;5;241m.\u001b[39mcrop(audio_file, clip)\n\u001b[1;32m----> 8\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43membedding_model\u001b[49m(waveform[\u001b[38;5;28;01mNone\u001b[39;00m])\n",
1628
+ "\u001b[1;31mNameError\u001b[0m: name 'embedding_model' is not defined"
1629
+ ]
1630
+ }
1631
+ ],
1632
+ "source": [
1633
+ "embeddings = np.zeros(shape=(len(segments), 192))\n",
1634
+ "for i, segment in enumerate(tqdm(segments)):\n",
1635
+ " embeddings[i] = segment_embedding(segment)\n",
1636
+ "embeddings = np.nan_to_num(embeddings)\n",
1637
+ "print(f'Embedding shape: {embeddings.shape}')"
1638
+ ]
1639
+ },
1640
+ {
1641
+ "cell_type": "code",
1642
+ "execution_count": null,
1643
+ "id": "537ea5b1",
1644
+ "metadata": {},
1645
+ "outputs": [],
1646
+ "source": []
1647
+ }
1648
+ ],
1649
+ "metadata": {
1650
+ "kernelspec": {
1651
+ "display_name": "Python 3",
1652
+ "language": "python",
1653
+ "name": "python3"
1654
+ },
1655
+ "language_info": {
1656
+ "codemirror_mode": {
1657
+ "name": "ipython",
1658
+ "version": 3
1659
+ },
1660
+ "file_extension": ".py",
1661
+ "mimetype": "text/x-python",
1662
+ "name": "python",
1663
+ "nbconvert_exporter": "python",
1664
+ "pygments_lexer": "ipython3",
1665
+ "version": "3.10.4"
1666
+ },
1667
+ "vscode": {
1668
+ "interpreter": {
1669
+ "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
1670
+ }
1671
+ }
1672
+ },
1673
+ "nbformat": 4,
1674
+ "nbformat_minor": 5
1675
+ }
packages.txt ADDED
@@ -0,0 +1 @@
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers
2
+ git+https://github.com/pyannote/pyannote-audio
3
+ git+https://github.com/openai/whisper.git
4
+ gradio==3.12
5
+ ffmpeg-python
6
+ pandas==1.5.0
7
+ pytube==12.1.0
8
+ sacremoses
9
+ sentencepiece
10
+ tokenizers
11
+ torch
12
+ torchaudio
13
+ tqdm==4.64.1
14
+ EasyNMT==2.0.2
15
+ nltk
16
+ transformers
17
+ pysrt
18
+ psutil==5.9.2
19
+ requests
20
+ gpuinfo
21
+ moviepy