Spaces:

DataForGood
/

bechdelai-demo

Build error

App Files Files Community

TheoLvs committed on Feb 26, 2023

Commit

85d8489

1 Parent(s): 10b3d4f

First commit

Browse files

Files changed (11) hide show

.github/workflows/check.yml +16 -0
.github/workflows/main.yml +20 -0
.gitignore +3 -0
README.md +1 -1
app.py +180 -0
bechdelaidemo/__init_.py +0 -0
bechdelaidemo/utils.py +86 -0
logo.png +0 -0
notebooks/20230225 - Demo ODI.ipynb +1675 -0
packages.txt +1 -0
requirements.txt +21 -0

.github/workflows/check.yml ADDED Viewed

	@@ -0,0 +1,16 @@

+# Runs the lfs-warning action on pull requests targeting main (or on manual dispatch)
+# with the file size limit configured at the bottom of this file.
+name: Check file size
+on:               # or directly `on: [push]` to run the action on every push on any branch
+  pull_request:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+jobs:
+  # NOTE(review): the job id is the same as in the "Sync to Hugging Face hub" workflow,
+  # although the only step here is the large-file check - consider a more descriptive id.
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check large files
+        uses: ActionsDesk/lfs-warning@v2.0
+        with:
+          # 10485760 bytes = 10 * 1024 * 1024
+          filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces

.github/workflows/main.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+# Pushes the main branch to the Hugging Face Space on every push to main
+# (or on manual dispatch).
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          # fetch-depth 0 asks checkout for the full history instead of a shallow clone
+          fetch-depth: 0
+          # also download Git LFS objects
+          lfs: true
+      - name: Push to hub
+        env:
+          # Token read from the repository secret named HF_TOKEN
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # The token is embedded in the remote URL as the password for user `dataforgood`
+        run: git push https://dataforgood:$HF_TOKEN@huggingface.co/spaces/dataforgood/bechdelai-demo main

.gitignore CHANGED Viewed

@@ -6,6 +6,9 @@ __pycache__/
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/

 # C extensions
 *.so
+*.wav
+*.mp4
 # Distribution / packaging
 .Python
 build/

README.md CHANGED Viewed

	@@ -1 +1 @@
1	- # bechdelai-demo


1	+ # bechdelai-tool-demo

app.py ADDED Viewed

	@@ -0,0 +1,180 @@

+# Inspired from https://huggingface.co/spaces/vumichien/whisper-speaker-diarization/blob/main/app.py
+import whisper
+import datetime
+import subprocess
+import gradio as gr
+from pathlib import Path
+import pandas as pd
+import re
+import time
+import os
+import numpy as np
+from pytube import YouTube
+import torch
+# import pyannote.audio
+# from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+# from pyannote.audio import Audio
+# from pyannote.core import Segment
+# from sklearn.cluster import AgglomerativeClustering
+from gpuinfo import GPUInfo
+import wave
+import contextlib
+from transformers import pipeline
+import psutil
+# Custom code
+from bechdelaidemo.utils import download_youtube_video
+from bechdelaidemo.utils import extract_audio_from_movie
+# Constants
+# Whisper model names offered in the "Selected Whisper model" dropdown
+whisper_models = ["tiny.en","base.en","tiny","base", "small", "medium", "large"]
+# NOTE(review): `device` is not referenced anywhere else in the visible part of this file
+device = 0 if torch.cuda.is_available() else "cpu"
+# Make sure the `output` directory exists (no error if it is already there)
+os.makedirs('output', exist_ok=True)
+# Prepare embedding model (disabled together with the pyannote imports above)
+# embedding_model = PretrainedSpeakerEmbedding(
+#     "speechbrain/spkrec-ecapa-voxceleb",
+#     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+def get_youtube(video_url):
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+    print("Success download video")
+    print(abs_video_path)
+    return abs_video_path
+def _return_yt_html_embed(yt_url):
+    video_id = yt_url.split("?v=")[-1]
+    HTML_str = (
+        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+        " </center>"
+    )
+    return HTML_str
+def speech_to_text(video_filepath, selected_source_lang = "en", whisper_model = "tiny.en"):
+    """
+    # Transcribe youtube link using OpenAI Whisper
+    1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
+    2. Generating speaker embeddings for each segments.
+    3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+    Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+    Speaker diarization model and pipeline from by https://github.com/pyannote/pyannote-audio
+    """
+    time_start = time.time()
+    # Convert video to audio
+    audio_filepath = extract_audio_from_movie(video_filepath,".wav")
+    # Load whisper
+    model = whisper.load_model(whisper_model)
+    # Get duration
+    with contextlib.closing(wave.open(audio_filepath,'r')) as f:
+        frames = f.getnframes()
+        rate = f.getframerate()
+        duration = frames / float(rate)
+    print(f"conversion to wav ready, duration of audio file: {duration}")
+    # Transcribe audio
+    options = dict(language=selected_source_lang, beam_size=5, best_of=5)
+    transcribe_options = dict(task="transcribe", **options)
+    result = model.transcribe(audio_filepath, **transcribe_options)
+    segments = result["segments"]
+    text = result["text"].strip()
+    print("starting whisper done with whisper")
+    return [text]
+source_language_list = ["en","fr"]
+# ---- Gradio Layout -----
+# Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
+video_in = gr.Video(label="Video file", mirror_webcam=False)
+youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
+selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="tiny.en", label="Selected Whisper model", interactive=True)
+# transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
+output_text = gr.Textbox(label = "Transcribed text",lines = 10)
+title = "BechdelAI - demo"
+demo = gr.Blocks(title=title,live = True)
+demo.encrypt = False
+with demo:
+    with gr.Tab("BechdelAI - dialogue demo"):
+        gr.Markdown('''
+            <div>
+                <h1 style='text-align: center'>BechdelAI - Dialogue demo</h1>
+                # <img src="logo.png" style="width:200px"/>
+            </div>
+        ''')
+        with gr.Row():
+            gr.Markdown('''
+            ### Transcribe youtube link using OpenAI Whisper
+            ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
+            ##### 2. Generating speaker embeddings for each segments.
+            ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+            ''')
+        with gr.Row():
+            with gr.Column():
+                # gr.Markdown('''### You can test by following examples:''')
+                examples = gr.Examples(examples=
+                        [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+                        "https://www.youtube.com/watch?v=-UX0X45sYe4",
+                        "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+                    label="Examples", inputs=[youtube_url_in])
+            with gr.Column():
+                youtube_url_in.render()
+                download_youtube_btn = gr.Button("Download Youtube video")
+                download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                    video_in])
+                print(video_in)
+            with gr.Column():
+                youtube_url_in.render()
+                download_youtube_btn = gr.Button("Download Youtube video")
+                download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                    video_in])
+                print(video_in)
+            with gr.Column():
+                video_in.render()
+        with gr.Row():
+            with gr.Column():
+                with gr.Column():
+                    gr.Markdown('''
+                    ##### Here you can start the transcription process.
+                    ##### Please select the source language for transcription.
+                    ##### You should select a number of speakers for getting better results.
+                    ''')
+                selected_source_lang.render()
+                selected_whisper_model.render()
+                transcribe_btn = gr.Button("Transcribe audio and diarization")
+                transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], [output_text])
+        # with gr.Row():
+        #     gr.Markdown('''
+        #     ##### Here you will get transcription  output
+        #     ##### ''')
+        with gr.Row():
+            with gr.Column():
+                output_text.render()
+demo.launch(debug=True)

bechdelaidemo/__init_.py ADDED Viewed

File without changes

bechdelaidemo/utils.py ADDED Viewed

	@@ -0,0 +1,86 @@

+from pytube import YouTube
+import moviepy.editor as mp
+def download_youtube_video(link: str, filename: str, caption_language: str = "en") -> None:
+    """Download a youtube video with captions given an id
+    Parameters
+    ----------
+    link : str
+        Youtube video link
+    filename : str
+        File name to save the video and the caption
+    caption_language : str
+        Language caption to download
+    Raises
+    ------
+    TypeError
+        url must be a string
+    ValueError
+        url must start with 'http'
+    """
+    try:
+        yt = YouTube(link)
+    except:
+        print("Connection Error")
+        return
+    filename = filename if filename.endswith(".mp4") else filename + ".mp4"
+    try:
+        (
+            yt.streams.filter(progressive=True, file_extension="mp4")
+            .order_by("resolution")
+            .desc()
+            .first()
+        ).download(filename=filename)
+    except Exception as e:
+        print("Could not download the video!", e)
+    try:
+        captions = {
+            k: v
+            for k, v in yt.captions.lang_code_index.items()
+            if caption_language in k
+        }
+        for lang, caption in captions.items():
+            caption.download(title=f"caption_{lang}", srt=False)
+    except Exception as e:
+        print("Could not download the caption!", e)
+    print("Task Completed!")
+def download_youtube_audio(link:str,filename:str = "audio.mp3") -> str:
+    yt = YouTube(link)
+    stream = yt.streams.filter(only_audio=True)[0]
+    stream.download(filename=filename)
+    return filename
+def import_as_clip(path_to_video: str) -> mp.VideoFileClip:
+    """Imports a video file as a VideoFileClip object.
+    Parameters:
+        path_to_video (str): Path to a video file.
+    Returns:
+        mp.VideoFileClip: VideoFileClip object.
+    """
+    return mp.VideoFileClip(path_to_video)
+def extract_audio_from_movie(file: str, extension: str = '.wav') -> None:
+    """Extract the audio from a film and save it to a file.
+    The audio is saved in the same directory as the film.
+    Parameters:
+        file (str): The name of the film file to extract the audio from.
+        extension (str): The file extension of the audio file to save (default is ".wav").
+    """
+    clip = import_as_clip(file)
+    filename = file.split(sep='.')[0] + extension
+    clip.audio.write_audiofile(filename)
+    return filename

logo.png ADDED Viewed

notebooks/20230225 - Demo ODI.ipynb ADDED Viewed

	@@ -0,0 +1,1675 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5813b894",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
+   "source": [
+    "# For developers who want to use the latest development version or the library locally\n",
+    "# Use poetry to install dependencies\n",
+    "import sys\n",
+    "sys.path.append(\"../\") # Or change to the folder to the direction of \n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import bechdelaidemo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6521fddd",
+   "metadata": {},
+   "source": [
+    "# Bechdel test on a dialogue scene\n",
+    "The test is already done on a dialogue scene. So easier, because we don't have to extract dialogue segments from a video"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dfa5ac0e",
+   "metadata": {},
+   "source": [
+    "## Test on 2 dialogue scenes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b657fa08",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path1 = \"https://www.youtube.com/watch?v=b2f2Kqt_KcE&ab_channel=Movieclips\" #Devils wears prada\n",
+    "path2 = \"https://www.youtube.com/watch?v=FDFdroN7d0w\" #Marriage story"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe733fa4",
+   "metadata": {},
+   "source": [
+    "## Download videos and audios from Youtube\n",
+    "Inspiration from https://huggingface.co/spaces/vumichien/whisper-speaker-diarization/blob/main/app.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ac260c66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bechdelaidemo.utils import download_youtube_video"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5aa1ce3d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Task Completed!\n"
+     ]
+    }
+   ],
+   "source": [
+    "download_youtube_video(path2,\"video.mp4\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d78d3671",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "video_path = \"video.mp4\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "999e8c78",
+   "metadata": {},
+   "source": [
+    "## Convert video to audio wav file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "766a386d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bechdelaidemo.utils import extract_audio_from_movie"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8f5028ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MoviePy - Writing audio in video.wav\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                      "
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MoviePy - Done.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r"
+     ]
+    }
+   ],
+   "source": [
+    "extract_audio_from_movie(video_path,\".wav\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "9af64b12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "audio_file = \"video.wav\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88a8f8f1",
+   "metadata": {},
+   "source": [
+    "## Clean audio\n",
+    "- https://huggingface.co/speechbrain/sepformer-wham16k-enhancement\n",
+    "- Spleeter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f74de64",
+   "metadata": {},
+   "source": [
+    "## Extract info using Whisper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "eb33487c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import whisper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "12588878",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 90.2MiB/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = whisper.load_model(\"tiny.en\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "59386a8d",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'task': 'transcribe', 'language': 'en', 'beam_size': 5, 'best_of': 5}\n"
+     ]
+    }
+   ],
+   "source": [
+    "## Transcribe audio\n",
+    "options = dict(language=\"en\", beam_size=5, best_of=5)\n",
+    "transcribe_options = dict(task=\"transcribe\", **options)\n",
+    "\n",
+    "print(transcribe_options)\n",
+    "\n",
+    "result = model.transcribe(\"video.wav\", **transcribe_options)\n",
+    "segments = result[\"segments\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "4a2357b8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\" You're being so much like your father. Do not compare me to my father. I didn't compare you, Dan. I said you were acting like him. You're exactly like your mother. Everything you're complaining about her you're doing. You're suffocating in me. First of all, I love my mother. She was a wonderful mother. Just repeating what you told me. Secondly, how dare you compare my mother into my mother? I'm maybe like my father, but I am not like my mother. You are! And you're like my father. You're also like my mother. You're all the bad things about all of these people. But mostly your mother. When we would lie in bed together, sometimes I would look at you and see her and just feel so gross. I felt appalled when you touched her. You're a slob. I mean all the beds, clothes on the camera, and it steps together with you like anything. You want to heal my skin off. You'll never be happy. And now lay her anywhere. You'll think you found some better opposite guy than me. And in a few years, you rebel against him because you need to have your voice. But you don't want a voice. You just want to fucking complain about not having a voice. I think about being married to you and that woman is a stranger to me. I mean we had a child and a bitch. You've gone back to your life before you met me. It's pathetic. People used to tell me that you were too selfish to be a great artist. And I used to defend you. They were absolutely right. All your best acting is behind you. You're back to being a hack. You got slighted me. You're a fucking villain. And you want to present yourself as a victim because it's a good legal strategy? Fine. But you and I both know you chose this life. You wanted it until you didn't. You used me so you could get out of LA. I didn't use you. You did. And then you blamed me for it. You always made me aware of what I was doing wrong. How I was falling short. Life with you was joyless. Then you had to go and fuck someone. You should be upset that I fucked her. 
You should be upset that I had a laugh with her. Do you love her? No. But she didn't hate me. You hated me. You hated me. You hated me. You fucked somebody we worked with. You stopped having sex with me in the last year. I never cheated on you. What was cheating on me? But there's so much I could have done. I was a director in my 20s who came from nothing and was suddenly on the cover of fucking Time Out New York. I was hot shit and I wanted to fuck everybody and I didn't. And I loved you and I didn't want to lose you. But I had made my 20s. And I didn't want to lose that too. And I kind of did. And you wanted so much so fast. I didn't even want to get married. Fuck it. There's so much I didn't do. Thanks for that. You're welcome. I can't believe I didn't know you forever. You're fucking insane. And you're fucking winning. Are you kidding me? I'm wanting to be married. I don't already lost. You didn't love me as much as I loved you. What does that have to do with LA? What? You're so merged with your own selfishness. You don't need to identify it and selfishness anymore. You're such a dick. Every day I wake up and I hope you're dead. Dead like it. If I can guarantee every movie, okay? I don't think I'm gonna kill this. As I can hit by a car and die. You don't work with me once you're left alone. I lost you and you 2, 3 4 I lost you. I lost you, 2 4 I lost you, 2 5 I'm sorry. Sheila. I'm sorry. Did you know? You\""
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result[\"text\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "26fcf349",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'id': 0,\n",
+       "  'seek': 0,\n",
+       "  'start': 0.0,\n",
+       "  'end': 3.0,\n",
+       "  'text': \" You're being so much like your father.\",\n",
+       "  'tokens': [921, 821, 852, 523, 881, 588, 534, 2988, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 1,\n",
+       "  'seek': 0,\n",
+       "  'start': 3.0,\n",
+       "  'end': 5.0,\n",
+       "  'text': ' Do not compare me to my father.',\n",
+       "  'tokens': [2141, 407, 8996, 502, 284, 616, 2988, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 2,\n",
+       "  'seek': 0,\n",
+       "  'start': 5.0,\n",
+       "  'end': 6.0,\n",
+       "  'text': \" I didn't compare you, Dan.\",\n",
+       "  'tokens': [314, 1422, 470, 8996, 345, 11, 6035, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 3,\n",
+       "  'seek': 0,\n",
+       "  'start': 6.0,\n",
+       "  'end': 7.0,\n",
+       "  'text': ' I said you were acting like him.',\n",
+       "  'tokens': [314, 531, 345, 547, 7205, 588, 683, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 4,\n",
+       "  'seek': 0,\n",
+       "  'start': 7.0,\n",
+       "  'end': 8.0,\n",
+       "  'text': \" You're exactly like your mother.\",\n",
+       "  'tokens': [921, 821, 3446, 588, 534, 2802, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 5,\n",
+       "  'seek': 0,\n",
+       "  'start': 8.0,\n",
+       "  'end': 10.0,\n",
+       "  'text': \" Everything you're complaining about her you're doing.\",\n",
+       "  'tokens': [11391, 345, 821, 18705, 546, 607, 345, 821, 1804, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 6,\n",
+       "  'seek': 0,\n",
+       "  'start': 10.0,\n",
+       "  'end': 12.0,\n",
+       "  'text': \" You're suffocating in me.\",\n",
+       "  'tokens': [921, 821, 3027, 27123, 287, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 7,\n",
+       "  'seek': 0,\n",
+       "  'start': 12.0,\n",
+       "  'end': 13.0,\n",
+       "  'text': ' First of all, I love my mother.',\n",
+       "  'tokens': [3274, 286, 477, 11, 314, 1842, 616, 2802, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 8,\n",
+       "  'seek': 0,\n",
+       "  'start': 13.0,\n",
+       "  'end': 15.0,\n",
+       "  'text': ' She was a wonderful mother.',\n",
+       "  'tokens': [1375, 373, 257, 7932, 2802, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 9,\n",
+       "  'seek': 0,\n",
+       "  'start': 15.0,\n",
+       "  'end': 16.0,\n",
+       "  'text': ' Just repeating what you told me.',\n",
+       "  'tokens': [2329, 20394, 644, 345, 1297, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 10,\n",
+       "  'seek': 0,\n",
+       "  'start': 16.0,\n",
+       "  'end': 19.0,\n",
+       "  'text': ' Secondly, how dare you compare my mother into my mother?',\n",
+       "  'tokens': [34276, 11, 703, 16498, 345, 8996, 616, 2802, 656, 616, 2802, 30],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 11,\n",
+       "  'seek': 0,\n",
+       "  'start': 19.0,\n",
+       "  'end': 22.0,\n",
+       "  'text': \" I'm maybe like my father, but I am not like my mother.\",\n",
+       "  'tokens': [314,\n",
+       "   1101,\n",
+       "   3863,\n",
+       "   588,\n",
+       "   616,\n",
+       "   2988,\n",
+       "   11,\n",
+       "   475,\n",
+       "   314,\n",
+       "   716,\n",
+       "   407,\n",
+       "   588,\n",
+       "   616,\n",
+       "   2802,\n",
+       "   13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 12,\n",
+       "  'seek': 0,\n",
+       "  'start': 22.0,\n",
+       "  'end': 23.0,\n",
+       "  'text': ' You are!',\n",
+       "  'tokens': [921, 389, 0],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 13,\n",
+       "  'seek': 0,\n",
+       "  'start': 23.0,\n",
+       "  'end': 25.0,\n",
+       "  'text': \" And you're like my father.\",\n",
+       "  'tokens': [843, 345, 821, 588, 616, 2988, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 14,\n",
+       "  'seek': 0,\n",
+       "  'start': 25.0,\n",
+       "  'end': 26.0,\n",
+       "  'text': \" You're also like my mother.\",\n",
+       "  'tokens': [921, 821, 635, 588, 616, 2802, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 15,\n",
+       "  'seek': 0,\n",
+       "  'start': 26.0,\n",
+       "  'end': 29.0,\n",
+       "  'text': \" You're all the bad things about all of these people.\",\n",
+       "  'tokens': [921, 821, 477, 262, 2089, 1243, 546, 477, 286, 777, 661, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15723278113370817,\n",
+       "  'compression_ratio': 1.9415807560137457,\n",
+       "  'no_speech_prob': 0.16063228249549866},\n",
+       " {'id': 16,\n",
+       "  'seek': 2900,\n",
+       "  'start': 29.0,\n",
+       "  'end': 30.0,\n",
+       "  'text': ' But mostly your mother.',\n",
+       "  'tokens': [887, 4632, 534, 2802, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 17,\n",
+       "  'seek': 2900,\n",
+       "  'start': 30.0,\n",
+       "  'end': 32.0,\n",
+       "  'text': ' When we would lie in bed together, sometimes I would look at you',\n",
+       "  'tokens': [1649,\n",
+       "   356,\n",
+       "   561,\n",
+       "   6486,\n",
+       "   287,\n",
+       "   3996,\n",
+       "   1978,\n",
+       "   11,\n",
+       "   3360,\n",
+       "   314,\n",
+       "   561,\n",
+       "   804,\n",
+       "   379,\n",
+       "   345],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 18,\n",
+       "  'seek': 2900,\n",
+       "  'start': 32.0,\n",
+       "  'end': 34.0,\n",
+       "  'text': ' and see her and just feel so gross.',\n",
+       "  'tokens': [290, 766, 607, 290, 655, 1254, 523, 10319, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 19,\n",
+       "  'seek': 2900,\n",
+       "  'start': 34.0,\n",
+       "  'end': 36.0,\n",
+       "  'text': ' I felt appalled when you touched her.',\n",
+       "  'tokens': [314, 2936, 41586, 618, 345, 12615, 607, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 20,\n",
+       "  'seek': 2900,\n",
+       "  'start': 36.0,\n",
+       "  'end': 37.0,\n",
+       "  'text': \" You're a slob.\",\n",
+       "  'tokens': [921, 821, 257, 1017, 672, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 21,\n",
+       "  'seek': 2900,\n",
+       "  'start': 37.0,\n",
+       "  'end': 39.0,\n",
+       "  'text': ' I mean all the beds, clothes on the camera,',\n",
+       "  'tokens': [314, 1612, 477, 262, 20237, 11, 8242, 319, 262, 4676, 11],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 22,\n",
+       "  'seek': 2900,\n",
+       "  'start': 39.0,\n",
+       "  'end': 41.0,\n",
+       "  'text': ' and it steps together with you like anything.',\n",
+       "  'tokens': [290, 340, 4831, 1978, 351, 345, 588, 1997, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 23,\n",
+       "  'seek': 2900,\n",
+       "  'start': 41.0,\n",
+       "  'end': 42.0,\n",
+       "  'text': ' You want to heal my skin off.',\n",
+       "  'tokens': [921, 765, 284, 12035, 616, 4168, 572, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 24,\n",
+       "  'seek': 2900,\n",
+       "  'start': 42.0,\n",
+       "  'end': 43.0,\n",
+       "  'text': \" You'll never be happy.\",\n",
+       "  'tokens': [921, 1183, 1239, 307, 3772, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 25,\n",
+       "  'seek': 2900,\n",
+       "  'start': 43.0,\n",
+       "  'end': 44.0,\n",
+       "  'text': ' And now lay her anywhere.',\n",
+       "  'tokens': [843, 783, 3830, 607, 6609, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 26,\n",
+       "  'seek': 2900,\n",
+       "  'start': 44.0,\n",
+       "  'end': 47.0,\n",
+       "  'text': \" You'll think you found some better opposite guy than me.\",\n",
+       "  'tokens': [921, 1183, 892, 345, 1043, 617, 1365, 6697, 3516, 621, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 27,\n",
+       "  'seek': 2900,\n",
+       "  'start': 47.0,\n",
+       "  'end': 51.0,\n",
+       "  'text': ' And in a few years, you rebel against him because you need to have your voice.',\n",
+       "  'tokens': [843,\n",
+       "   287,\n",
+       "   257,\n",
+       "   1178,\n",
+       "   812,\n",
+       "   11,\n",
+       "   345,\n",
+       "   14034,\n",
+       "   1028,\n",
+       "   683,\n",
+       "   780,\n",
+       "   345,\n",
+       "   761,\n",
+       "   284,\n",
+       "   423,\n",
+       "   534,\n",
+       "   3809,\n",
+       "   13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 28,\n",
+       "  'seek': 2900,\n",
+       "  'start': 51.0,\n",
+       "  'end': 53.0,\n",
+       "  'text': \" But you don't want a voice.\",\n",
+       "  'tokens': [887, 345, 836, 470, 765, 257, 3809, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 29,\n",
+       "  'seek': 2900,\n",
+       "  'start': 53.0,\n",
+       "  'end': 55.0,\n",
+       "  'text': ' You just want to fucking complain about not having a voice.',\n",
+       "  'tokens': [921, 655, 765, 284, 9372, 13121, 546, 407, 1719, 257, 3809, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 30,\n",
+       "  'seek': 2900,\n",
+       "  'start': 55.0,\n",
+       "  'end': 58.0,\n",
+       "  'text': ' I think about being married to you and that woman is a stranger to me.',\n",
+       "  'tokens': [314,\n",
+       "   892,\n",
+       "   546,\n",
+       "   852,\n",
+       "   6405,\n",
+       "   284,\n",
+       "   345,\n",
+       "   290,\n",
+       "   326,\n",
+       "   2415,\n",
+       "   318,\n",
+       "   257,\n",
+       "   16195,\n",
+       "   284,\n",
+       "   502,\n",
+       "   13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.22024561564127604,\n",
+       "  'compression_ratio': 1.8056338028169014,\n",
+       "  'no_speech_prob': 9.823902473726775e-07},\n",
+       " {'id': 31,\n",
+       "  'seek': 5800,\n",
+       "  'start': 58.0,\n",
+       "  'end': 61.0,\n",
+       "  'text': ' I mean we had a child and a bitch.',\n",
+       "  'tokens': [314, 1612, 356, 550, 257, 1200, 290, 257, 21551, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 32,\n",
+       "  'seek': 5800,\n",
+       "  'start': 61.0,\n",
+       "  'end': 63.0,\n",
+       "  'text': \" You've gone back to your life before you met me.\",\n",
+       "  'tokens': [921, 1053, 3750, 736, 284, 534, 1204, 878, 345, 1138, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 33,\n",
+       "  'seek': 5800,\n",
+       "  'start': 63.0,\n",
+       "  'end': 64.0,\n",
+       "  'text': \" It's pathetic.\",\n",
+       "  'tokens': [632, 338, 29215, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 34,\n",
+       "  'seek': 5800,\n",
+       "  'start': 64.0,\n",
+       "  'end': 68.0,\n",
+       "  'text': ' People used to tell me that you were too selfish to be a great artist.',\n",
+       "  'tokens': [4380,\n",
+       "   973,\n",
+       "   284,\n",
+       "   1560,\n",
+       "   502,\n",
+       "   326,\n",
+       "   345,\n",
+       "   547,\n",
+       "   1165,\n",
+       "   20363,\n",
+       "   284,\n",
+       "   307,\n",
+       "   257,\n",
+       "   1049,\n",
+       "   6802,\n",
+       "   13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 35,\n",
+       "  'seek': 5800,\n",
+       "  'start': 68.0,\n",
+       "  'end': 70.0,\n",
+       "  'text': ' And I used to defend you.',\n",
+       "  'tokens': [843, 314, 973, 284, 4404, 345, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 36,\n",
+       "  'seek': 5800,\n",
+       "  'start': 70.0,\n",
+       "  'end': 71.0,\n",
+       "  'text': ' They were absolutely right.',\n",
+       "  'tokens': [1119, 547, 5543, 826, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 37,\n",
+       "  'seek': 5800,\n",
+       "  'start': 71.0,\n",
+       "  'end': 73.0,\n",
+       "  'text': ' All your best acting is behind you.',\n",
+       "  'tokens': [1439, 534, 1266, 7205, 318, 2157, 345, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 38,\n",
+       "  'seek': 5800,\n",
+       "  'start': 73.0,\n",
+       "  'end': 74.0,\n",
+       "  'text': \" You're back to being a hack.\",\n",
+       "  'tokens': [921, 821, 736, 284, 852, 257, 8156, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 39,\n",
+       "  'seek': 5800,\n",
+       "  'start': 74.0,\n",
+       "  'end': 76.0,\n",
+       "  'text': ' You got slighted me.',\n",
+       "  'tokens': [921, 1392, 3731, 276, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 40,\n",
+       "  'seek': 5800,\n",
+       "  'start': 76.0,\n",
+       "  'end': 77.0,\n",
+       "  'text': \" You're a fucking villain.\",\n",
+       "  'tokens': [921, 821, 257, 9372, 16687, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 41,\n",
+       "  'seek': 5800,\n",
+       "  'start': 77.0,\n",
+       "  'end': 79.0,\n",
+       "  'text': ' And you want to present yourself as a victim',\n",
+       "  'tokens': [843, 345, 765, 284, 1944, 3511, 355, 257, 3117],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 42,\n",
+       "  'seek': 5800,\n",
+       "  'start': 79.0,\n",
+       "  'end': 81.0,\n",
+       "  'text': \" because it's a good legal strategy?\",\n",
+       "  'tokens': [780, 340, 338, 257, 922, 2742, 4811, 30],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 43,\n",
+       "  'seek': 5800,\n",
+       "  'start': 81.0,\n",
+       "  'end': 82.0,\n",
+       "  'text': ' Fine.',\n",
+       "  'tokens': [17867, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 44,\n",
+       "  'seek': 5800,\n",
+       "  'start': 82.0,\n",
+       "  'end': 84.0,\n",
+       "  'text': ' But you and I both know you chose this life.',\n",
+       "  'tokens': [887, 345, 290, 314, 1111, 760, 345, 7690, 428, 1204, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 45,\n",
+       "  'seek': 5800,\n",
+       "  'start': 84.0,\n",
+       "  'end': 87.0,\n",
+       "  'text': \" You wanted it until you didn't.\",\n",
+       "  'tokens': [921, 2227, 340, 1566, 345, 1422, 470, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.12269237166956852,\n",
+       "  'compression_ratio': 1.6801346801346801,\n",
+       "  'no_speech_prob': 8.356517469110258e-07},\n",
+       " {'id': 46,\n",
+       "  'seek': 8700,\n",
+       "  'start': 87.0,\n",
+       "  'end': 90.0,\n",
+       "  'text': ' You used me so you could get out of LA.',\n",
+       "  'tokens': [921, 973, 502, 523, 345, 714, 651, 503, 286, 9131, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 47,\n",
+       "  'seek': 8700,\n",
+       "  'start': 90.0,\n",
+       "  'end': 91.0,\n",
+       "  'text': \" I didn't use you.\",\n",
+       "  'tokens': [314, 1422, 470, 779, 345, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 48,\n",
+       "  'seek': 8700,\n",
+       "  'start': 91.0,\n",
+       "  'end': 92.0,\n",
+       "  'text': ' You did.',\n",
+       "  'tokens': [921, 750, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 49,\n",
+       "  'seek': 8700,\n",
+       "  'start': 92.0,\n",
+       "  'end': 94.0,\n",
+       "  'text': ' And then you blamed me for it.',\n",
+       "  'tokens': [843, 788, 345, 13772, 502, 329, 340, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 50,\n",
+       "  'seek': 8700,\n",
+       "  'start': 94.0,\n",
+       "  'end': 96.0,\n",
+       "  'text': ' You always made me aware of what I was doing wrong.',\n",
+       "  'tokens': [921, 1464, 925, 502, 3910, 286, 644, 314, 373, 1804, 2642, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 51,\n",
+       "  'seek': 8700,\n",
+       "  'start': 96.0,\n",
+       "  'end': 98.0,\n",
+       "  'text': ' How I was falling short.',\n",
+       "  'tokens': [1374, 314, 373, 7463, 1790, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 52,\n",
+       "  'seek': 8700,\n",
+       "  'start': 98.0,\n",
+       "  'end': 99.0,\n",
+       "  'text': ' Life with you was joyless.',\n",
+       "  'tokens': [5155, 351, 345, 373, 8716, 1203, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 53,\n",
+       "  'seek': 8700,\n",
+       "  'start': 99.0,\n",
+       "  'end': 101.0,\n",
+       "  'text': ' Then you had to go and fuck someone.',\n",
+       "  'tokens': [3244, 345, 550, 284, 467, 290, 5089, 2130, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 54,\n",
+       "  'seek': 8700,\n",
+       "  'start': 101.0,\n",
+       "  'end': 103.0,\n",
+       "  'text': ' You should be upset that I fucked her.',\n",
+       "  'tokens': [921, 815, 307, 9247, 326, 314, 20654, 607, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 55,\n",
+       "  'seek': 8700,\n",
+       "  'start': 103.0,\n",
+       "  'end': 106.0,\n",
+       "  'text': ' You should be upset that I had a laugh with her.',\n",
+       "  'tokens': [921, 815, 307, 9247, 326, 314, 550, 257, 6487, 351, 607, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 56,\n",
+       "  'seek': 8700,\n",
+       "  'start': 106.0,\n",
+       "  'end': 107.0,\n",
+       "  'text': ' Do you love her?',\n",
+       "  'tokens': [2141, 345, 1842, 607, 30],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 57,\n",
+       "  'seek': 8700,\n",
+       "  'start': 107.0,\n",
+       "  'end': 108.0,\n",
+       "  'text': ' No.',\n",
+       "  'tokens': [1400, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 58,\n",
+       "  'seek': 8700,\n",
+       "  'start': 108.0,\n",
+       "  'end': 109.0,\n",
+       "  'text': \" But she didn't hate me.\",\n",
+       "  'tokens': [887, 673, 1422, 470, 5465, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 59,\n",
+       "  'seek': 8700,\n",
+       "  'start': 109.0,\n",
+       "  'end': 110.0,\n",
+       "  'text': ' You hated me.',\n",
+       "  'tokens': [921, 16563, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 60,\n",
+       "  'seek': 8700,\n",
+       "  'start': 110.0,\n",
+       "  'end': 111.0,\n",
+       "  'text': ' You hated me.',\n",
+       "  'tokens': [921, 16563, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 61,\n",
+       "  'seek': 8700,\n",
+       "  'start': 111.0,\n",
+       "  'end': 112.0,\n",
+       "  'text': ' You hated me.',\n",
+       "  'tokens': [921, 16563, 502, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 62,\n",
+       "  'seek': 8700,\n",
+       "  'start': 112.0,\n",
+       "  'end': 113.0,\n",
+       "  'text': ' You fucked somebody we worked with.',\n",
+       "  'tokens': [921, 20654, 8276, 356, 3111, 351, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 63,\n",
+       "  'seek': 8700,\n",
+       "  'start': 113.0,\n",
+       "  'end': 116.0,\n",
+       "  'text': ' You stopped having sex with me in the last year.',\n",
+       "  'tokens': [921, 5025, 1719, 1714, 351, 502, 287, 262, 938, 614, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.08677079749829841,\n",
+       "  'compression_ratio': 1.8721804511278195,\n",
+       "  'no_speech_prob': 9.081037433134043e-07},\n",
+       " {'id': 64,\n",
+       "  'seek': 11600,\n",
+       "  'start': 116.0,\n",
+       "  'end': 117.0,\n",
+       "  'text': ' I never cheated on you.',\n",
+       "  'tokens': [314, 1239, 37264, 319, 345, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 65,\n",
+       "  'seek': 11600,\n",
+       "  'start': 117.0,\n",
+       "  'end': 119.0,\n",
+       "  'text': ' What was cheating on me?',\n",
+       "  'tokens': [1867, 373, 21608, 319, 502, 30],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 66,\n",
+       "  'seek': 11600,\n",
+       "  'start': 119.0,\n",
+       "  'end': 121.0,\n",
+       "  'text': \" But there's so much I could have done.\",\n",
+       "  'tokens': [887, 612, 338, 523, 881, 314, 714, 423, 1760, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 67,\n",
+       "  'seek': 11600,\n",
+       "  'start': 121.0,\n",
+       "  'end': 123.0,\n",
+       "  'text': ' I was a director in my 20s who came from nothing',\n",
+       "  'tokens': [314, 373, 257, 3437, 287, 616, 1160, 82, 508, 1625, 422, 2147],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 68,\n",
+       "  'seek': 11600,\n",
+       "  'start': 123.0,\n",
+       "  'end': 126.0,\n",
+       "  'text': ' and was suddenly on the cover of fucking Time Out New York.',\n",
+       "  'tokens': [290,\n",
+       "   373,\n",
+       "   6451,\n",
+       "   319,\n",
+       "   262,\n",
+       "   3002,\n",
+       "   286,\n",
+       "   9372,\n",
+       "   3862,\n",
+       "   3806,\n",
+       "   968,\n",
+       "   1971,\n",
+       "   13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 69,\n",
+       "  'seek': 11600,\n",
+       "  'start': 126.0,\n",
+       "  'end': 129.0,\n",
+       "  'text': \" I was hot shit and I wanted to fuck everybody and I didn't.\",\n",
+       "  'tokens': [314,\n",
+       "   373,\n",
+       "   3024,\n",
+       "   7510,\n",
+       "   290,\n",
+       "   314,\n",
+       "   2227,\n",
+       "   284,\n",
+       "   5089,\n",
+       "   7288,\n",
+       "   290,\n",
+       "   314,\n",
+       "   1422,\n",
+       "   470,\n",
+       "   13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 70,\n",
+       "  'seek': 11600,\n",
+       "  'start': 129.0,\n",
+       "  'end': 131.0,\n",
+       "  'text': \" And I loved you and I didn't want to lose you.\",\n",
+       "  'tokens': [843,\n",
+       "   314,\n",
+       "   6151,\n",
+       "   345,\n",
+       "   290,\n",
+       "   314,\n",
+       "   1422,\n",
+       "   470,\n",
+       "   765,\n",
+       "   284,\n",
+       "   4425,\n",
+       "   345,\n",
+       "   13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 71,\n",
+       "  'seek': 11600,\n",
+       "  'start': 131.0,\n",
+       "  'end': 133.0,\n",
+       "  'text': ' But I had made my 20s.',\n",
+       "  'tokens': [887, 314, 550, 925, 616, 1160, 82, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 72,\n",
+       "  'seek': 11600,\n",
+       "  'start': 133.0,\n",
+       "  'end': 135.0,\n",
+       "  'text': \" And I didn't want to lose that too.\",\n",
+       "  'tokens': [843, 314, 1422, 470, 765, 284, 4425, 326, 1165, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 73,\n",
+       "  'seek': 11600,\n",
+       "  'start': 135.0,\n",
+       "  'end': 136.0,\n",
+       "  'text': ' And I kind of did.',\n",
+       "  'tokens': [843, 314, 1611, 286, 750, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 74,\n",
+       "  'seek': 11600,\n",
+       "  'start': 136.0,\n",
+       "  'end': 139.0,\n",
+       "  'text': ' And you wanted so much so fast.',\n",
+       "  'tokens': [843, 345, 2227, 523, 881, 523, 3049, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 75,\n",
+       "  'seek': 11600,\n",
+       "  'start': 139.0,\n",
+       "  'end': 141.0,\n",
+       "  'text': \" I didn't even want to get married.\",\n",
+       "  'tokens': [314, 1422, 470, 772, 765, 284, 651, 6405, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 76,\n",
+       "  'seek': 11600,\n",
+       "  'start': 141.0,\n",
+       "  'end': 142.0,\n",
+       "  'text': ' Fuck it.',\n",
+       "  'tokens': [25617, 340, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 77,\n",
+       "  'seek': 11600,\n",
+       "  'start': 142.0,\n",
+       "  'end': 145.0,\n",
+       "  'text': \" There's so much I didn't do.\",\n",
+       "  'tokens': [1318, 338, 523, 881, 314, 1422, 470, 466, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.11258821849581561,\n",
+       "  'compression_ratio': 1.8339622641509434,\n",
+       "  'no_speech_prob': 2.3229101486776926e-07},\n",
+       " {'id': 78,\n",
+       "  'seek': 14500,\n",
+       "  'start': 145.0,\n",
+       "  'end': 147.0,\n",
+       "  'text': ' Thanks for that.',\n",
+       "  'tokens': [6930, 329, 326, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 79,\n",
+       "  'seek': 14500,\n",
+       "  'start': 147.0,\n",
+       "  'end': 148.0,\n",
+       "  'text': \" You're welcome.\",\n",
+       "  'tokens': [921, 821, 7062, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 80,\n",
+       "  'seek': 14500,\n",
+       "  'start': 148.0,\n",
+       "  'end': 151.0,\n",
+       "  'text': \" I can't believe I didn't know you forever.\",\n",
+       "  'tokens': [314, 460, 470, 1975, 314, 1422, 470, 760, 345, 8097, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 81,\n",
+       "  'seek': 14500,\n",
+       "  'start': 151.0,\n",
+       "  'end': 154.0,\n",
+       "  'text': \" You're fucking insane.\",\n",
+       "  'tokens': [921, 821, 9372, 13251, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 82,\n",
+       "  'seek': 14500,\n",
+       "  'start': 154.0,\n",
+       "  'end': 157.0,\n",
+       "  'text': \" And you're fucking winning.\",\n",
+       "  'tokens': [843, 345, 821, 9372, 5442, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 83,\n",
+       "  'seek': 14500,\n",
+       "  'start': 157.0,\n",
+       "  'end': 159.0,\n",
+       "  'text': ' Are you kidding me?',\n",
+       "  'tokens': [4231, 345, 26471, 502, 30],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 84,\n",
+       "  'seek': 14500,\n",
+       "  'start': 159.0,\n",
+       "  'end': 161.0,\n",
+       "  'text': \" I'm wanting to be married.\",\n",
+       "  'tokens': [314, 1101, 10291, 284, 307, 6405, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 85,\n",
+       "  'seek': 14500,\n",
+       "  'start': 161.0,\n",
+       "  'end': 163.0,\n",
+       "  'text': \" I don't already lost.\",\n",
+       "  'tokens': [314, 836, 470, 1541, 2626, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 86,\n",
+       "  'seek': 14500,\n",
+       "  'start': 163.0,\n",
+       "  'end': 167.0,\n",
+       "  'text': \" You didn't love me as much as I loved you.\",\n",
+       "  'tokens': [921, 1422, 470, 1842, 502, 355, 881, 355, 314, 6151, 345, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 87,\n",
+       "  'seek': 14500,\n",
+       "  'start': 167.0,\n",
+       "  'end': 171.0,\n",
+       "  'text': ' What does that have to do with LA?',\n",
+       "  'tokens': [1867, 857, 326, 423, 284, 466, 351, 9131, 30],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 88,\n",
+       "  'seek': 14500,\n",
+       "  'start': 171.0,\n",
+       "  'end': 172.0,\n",
+       "  'text': ' What?',\n",
+       "  'tokens': [1867, 30],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.15252724697715359,\n",
+       "  'compression_ratio': 1.5245901639344261,\n",
+       "  'no_speech_prob': 1.9235710624343483e-07},\n",
+       " {'id': 89,\n",
+       "  'seek': 17200,\n",
+       "  'start': 172.0,\n",
+       "  'end': 175.0,\n",
+       "  'text': \" You're so merged with your own selfishness.\",\n",
+       "  'tokens': [921, 821, 523, 23791, 351, 534, 898, 20363, 1108, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.2508346222259186,\n",
+       "  'compression_ratio': 1.4583333333333333,\n",
+       "  'no_speech_prob': 3.485897934751847e-07},\n",
+       " {'id': 90,\n",
+       "  'seek': 17200,\n",
+       "  'start': 175.0,\n",
+       "  'end': 178.0,\n",
+       "  'text': \" You don't need to identify it and selfishness anymore.\",\n",
+       "  'tokens': [921, 836, 470, 761, 284, 5911, 340, 290, 20363, 1108, 7471, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.2508346222259186,\n",
+       "  'compression_ratio': 1.4583333333333333,\n",
+       "  'no_speech_prob': 3.485897934751847e-07},\n",
+       " {'id': 91,\n",
+       "  'seek': 17200,\n",
+       "  'start': 178.0,\n",
+       "  'end': 181.0,\n",
+       "  'text': \" You're such a dick.\",\n",
+       "  'tokens': [921, 821, 884, 257, 19317, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.2508346222259186,\n",
+       "  'compression_ratio': 1.4583333333333333,\n",
+       "  'no_speech_prob': 3.485897934751847e-07},\n",
+       " {'id': 92,\n",
+       "  'seek': 17200,\n",
+       "  'start': 181.0,\n",
+       "  'end': 184.0,\n",
+       "  'text': \" Every day I wake up and I hope you're dead.\",\n",
+       "  'tokens': [3887, 1110, 314, 7765, 510, 290, 314, 2911, 345, 821, 2636, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.2508346222259186,\n",
+       "  'compression_ratio': 1.4583333333333333,\n",
+       "  'no_speech_prob': 3.485897934751847e-07},\n",
+       " {'id': 93,\n",
+       "  'seek': 17200,\n",
+       "  'start': 184.0,\n",
+       "  'end': 185.0,\n",
+       "  'text': ' Dead like it.',\n",
+       "  'tokens': [5542, 588, 340, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.2508346222259186,\n",
+       "  'compression_ratio': 1.4583333333333333,\n",
+       "  'no_speech_prob': 3.485897934751847e-07},\n",
+       " {'id': 94,\n",
+       "  'seek': 17200,\n",
+       "  'start': 185.0,\n",
+       "  'end': 187.0,\n",
+       "  'text': ' If I can guarantee every movie, okay?',\n",
+       "  'tokens': [1002, 314, 460, 9149, 790, 3807, 11, 8788, 30],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.2508346222259186,\n",
+       "  'compression_ratio': 1.4583333333333333,\n",
+       "  'no_speech_prob': 3.485897934751847e-07},\n",
+       " {'id': 95,\n",
+       "  'seek': 17200,\n",
+       "  'start': 187.0,\n",
+       "  'end': 189.0,\n",
+       "  'text': \" I don't think I'm gonna kill this.\",\n",
+       "  'tokens': [314, 836, 470, 892, 314, 1101, 8066, 1494, 428, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.2508346222259186,\n",
+       "  'compression_ratio': 1.4583333333333333,\n",
+       "  'no_speech_prob': 3.485897934751847e-07},\n",
+       " {'id': 96,\n",
+       "  'seek': 17200,\n",
+       "  'start': 189.0,\n",
+       "  'end': 191.0,\n",
+       "  'text': ' As I can hit by a car and die.',\n",
+       "  'tokens': [1081, 314, 460, 2277, 416, 257, 1097, 290, 4656, 13],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.2508346222259186,\n",
+       "  'compression_ratio': 1.4583333333333333,\n",
+       "  'no_speech_prob': 3.485897934751847e-07},\n",
+       " {'id': 97,\n",
+       "  'seek': 19100,\n",
+       "  'start': 191.0,\n",
+       "  'end': 196.0,\n",
+       "  'text': \" You don't\",\n",
+       "  'tokens': [921, 836, 470],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.0730719472847734,\n",
+       "  'compression_ratio': 1.3488372093023255,\n",
+       "  'no_speech_prob': 1.8071212252834812e-05},\n",
+       " {'id': 98,\n",
+       "  'seek': 19100,\n",
+       "  'start': 196.0,\n",
+       "  'end': 201.0,\n",
+       "  'text': \" work with me once you're left alone.\",\n",
+       "  'tokens': [670, 351, 502, 1752, 345, 821, 1364, 3436, 13],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.0730719472847734,\n",
+       "  'compression_ratio': 1.3488372093023255,\n",
+       "  'no_speech_prob': 1.8071212252834812e-05},\n",
+       " {'id': 99,\n",
+       "  'seek': 19100,\n",
+       "  'start': 201.0,\n",
+       "  'end': 205.0,\n",
+       "  'text': ' I lost you and you 2, 3 4',\n",
+       "  'tokens': [314, 2626, 345, 290, 345, 362, 11, 513, 604],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.0730719472847734,\n",
+       "  'compression_ratio': 1.3488372093023255,\n",
+       "  'no_speech_prob': 1.8071212252834812e-05},\n",
+       " {'id': 100,\n",
+       "  'seek': 19100,\n",
+       "  'start': 205.0,\n",
+       "  'end': 208.0,\n",
+       "  'text': ' I lost you.',\n",
+       "  'tokens': [314, 2626, 345, 13],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.0730719472847734,\n",
+       "  'compression_ratio': 1.3488372093023255,\n",
+       "  'no_speech_prob': 1.8071212252834812e-05},\n",
+       " {'id': 101,\n",
+       "  'seek': 19100,\n",
+       "  'start': 208.0,\n",
+       "  'end': 211.0,\n",
+       "  'text': ' I lost you, 2 4',\n",
+       "  'tokens': [314, 2626, 345, 11, 362, 604],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.0730719472847734,\n",
+       "  'compression_ratio': 1.3488372093023255,\n",
+       "  'no_speech_prob': 1.8071212252834812e-05},\n",
+       " {'id': 102,\n",
+       "  'seek': 19100,\n",
+       "  'start': 211.0,\n",
+       "  'end': 214.0,\n",
+       "  'text': ' I lost you, 2 5',\n",
+       "  'tokens': [314, 2626, 345, 11, 362, 642],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.0730719472847734,\n",
+       "  'compression_ratio': 1.3488372093023255,\n",
+       "  'no_speech_prob': 1.8071212252834812e-05},\n",
+       " {'id': 103,\n",
+       "  'seek': 21400,\n",
+       "  'start': 214.0,\n",
+       "  'end': 218.88,\n",
+       "  'text': \" I'm sorry.\",\n",
+       "  'tokens': [314, 1101, 7926, 13],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.222068927906178,\n",
+       "  'compression_ratio': 1.0,\n",
+       "  'no_speech_prob': 0.04370500147342682},\n",
+       " {'id': 104,\n",
+       "  'seek': 21400,\n",
+       "  'start': 218.88,\n",
+       "  'end': 220.54,\n",
+       "  'text': ' Sheila.',\n",
+       "  'tokens': [49627, 13],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.222068927906178,\n",
+       "  'compression_ratio': 1.0,\n",
+       "  'no_speech_prob': 0.04370500147342682},\n",
+       " {'id': 105,\n",
+       "  'seek': 21400,\n",
+       "  'start': 220.54,\n",
+       "  'end': 224.66,\n",
+       "  'text': \" I'm sorry.\",\n",
+       "  'tokens': [314, 1101, 7926, 13],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.222068927906178,\n",
+       "  'compression_ratio': 1.0,\n",
+       "  'no_speech_prob': 0.04370500147342682},\n",
+       " {'id': 106,\n",
+       "  'seek': 21400,\n",
+       "  'start': 224.66,\n",
+       "  'end': 234.78,\n",
+       "  'text': ' Did you know?',\n",
+       "  'tokens': [7731, 345, 760, 30],\n",
+       "  'temperature': 1.0,\n",
+       "  'avg_logprob': -2.222068927906178,\n",
+       "  'compression_ratio': 1.0,\n",
+       "  'no_speech_prob': 0.04370500147342682},\n",
+       " {'id': 107,\n",
+       "  'seek': 23478,\n",
+       "  'start': 234.78,\n",
+       "  'end': 236.78,\n",
+       "  'text': ' You',\n",
+       "  'tokens': [50363, 921, 50463],\n",
+       "  'temperature': 0.0,\n",
+       "  'avg_logprob': -0.9333540797233582,\n",
+       "  'compression_ratio': 0.2727272727272727,\n",
+       "  'no_speech_prob': 0.5948350429534912}]"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "segments"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23f31008",
+   "metadata": {},
+   "source": [
+    "## Compute audio duration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "5f0fe450",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import wave\n",
+    "import contextlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "1015519a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "conversion to wav ready, duration of audio file: 258.93\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get duration\n",
+    "with contextlib.closing(wave.open(audio_file,'r')) as f:\n",
+    "    frames = f.getnframes()\n",
+    "    rate = f.getframerate()\n",
+    "    duration = frames / float(rate)\n",
+    "print(f\"conversion to wav ready, duration of audio file: {duration}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d54e4a6f",
+   "metadata": {},
+   "source": [
+    "## Speaker diarization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "4f978ac0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/codespace/.local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pyannote.audio\n",
+    "from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding\n",
+    "from pyannote.audio import Audio\n",
+    "from pyannote.core import Segment\n",
+    "from tqdm.auto import tqdm\n",
+    "import numpy as np\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f9a28ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "OSError",
+     "evalue": "[WinError 1314] Le client ne dispose pas d’un privilège nécessaire: 'C:\\\\Users\\\\theo.alvesdacosta\\\\.cache\\\\huggingface\\\\hub\\\\models--speechbrain--spkrec-ecapa-voxceleb\\\\snapshots\\\\5c0be3875fda05e81f3c004ed8c7c06be308de1e\\\\hyperparams.yaml' -> 'C:\\\\Users\\\\theo.alvesdacosta\\\\.cache\\\\torch\\\\pyannote\\\\speechbrain\\\\hyperparams.yaml'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mOSError\u001b[0m                                   Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[30], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m embedding_model \u001b[38;5;241m=\u001b[39m \u001b[43mPretrainedSpeakerEmbedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\n\u001b[0;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mspeechbrain/spkrec-ecapa-voxceleb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m      3\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcpu\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32m~\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\bechdelai-lU12Pf_x-py3.8\\lib\\site-packages\\pyannote\\audio\\pipelines\\speaker_verification.py:463\u001b[0m, in \u001b[0;36mPretrainedSpeakerEmbedding\u001b[1;34m(embedding, device, use_auth_token)\u001b[0m\n\u001b[0;32m    431\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Pretrained speaker embedding\u001b[39;00m\n\u001b[0;32m    432\u001b[0m \n\u001b[0;32m    433\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    459\u001b[0m \u001b[38;5;124;03m>>> embeddings = get_embedding(waveforms, masks=masks)\u001b[39;00m\n\u001b[0;32m    460\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m    462\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(embedding, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mspeechbrain\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m embedding:\n\u001b[1;32m--> 463\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSpeechBrainPretrainedSpeakerEmbedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    464\u001b[0m \u001b[43m        \u001b[49m\u001b[43membedding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\n\u001b[0;32m    465\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    467\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(embedding, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnvidia\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m embedding:\n\u001b[0;32m    468\u001b[0m     
\u001b[38;5;28;01mreturn\u001b[39;00m NeMoPretrainedSpeakerEmbedding(embedding, device\u001b[38;5;241m=\u001b[39mdevice)\n",
+      "File \u001b[1;32m~\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\bechdelai-lU12Pf_x-py3.8\\lib\\site-packages\\pyannote\\audio\\pipelines\\speaker_verification.py:242\u001b[0m, in \u001b[0;36mSpeechBrainPretrainedSpeakerEmbedding.__init__\u001b[1;34m(self, embedding, device, use_auth_token)\u001b[0m\n\u001b[0;32m    239\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding \u001b[38;5;241m=\u001b[39m embedding\n\u001b[0;32m    240\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice \u001b[38;5;241m=\u001b[39m device\n\u001b[1;32m--> 242\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclassifier_ \u001b[38;5;241m=\u001b[39m \u001b[43mSpeechBrain_EncoderClassifier\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_hparams\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    243\u001b[0m \u001b[43m    \u001b[49m\u001b[43msource\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    244\u001b[0m \u001b[43m    \u001b[49m\u001b[43msavedir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mCACHE_DIR\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m/speechbrain\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m    245\u001b[0m \u001b[43m    \u001b[49m\u001b[43mrun_opts\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdevice\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    246\u001b[0m \u001b[43m    
\u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    247\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32m~\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\bechdelai-lU12Pf_x-py3.8\\lib\\site-packages\\speechbrain\\pretrained\\interfaces.py:342\u001b[0m, in \u001b[0;36mPretrained.from_hparams\u001b[1;34m(cls, source, hparams_file, pymodule_file, overrides, savedir, use_auth_token, revision, **kwargs)\u001b[0m\n\u001b[0;32m    340\u001b[0m     clsname \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\n\u001b[0;32m    341\u001b[0m     savedir \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./pretrained_models/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mclsname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m-\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhashlib\u001b[38;5;241m.\u001b[39mmd5(source\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mUTF-8\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;250m \u001b[39merrors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mreplace\u001b[39m\u001b[38;5;124m'\u001b[39m))\u001b[38;5;241m.\u001b[39mhexdigest()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 342\u001b[0m hparams_local_path \u001b[38;5;241m=\u001b[39m \u001b[43mfetch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    343\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhparams_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msavedir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\n\u001b[0;32m    344\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    345\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m    346\u001b[0m     pymodule_local_path \u001b[38;5;241m=\u001b[39m fetch(\n\u001b[0;32m    347\u001b[0m         pymodule_file, source, savedir, 
use_auth_token, revision\n\u001b[0;32m    348\u001b[0m     )\n",
+      "File \u001b[1;32m~\\AppData\\Local\\pypoetry\\Cache\\virtualenvs\\bechdelai-lU12Pf_x-py3.8\\lib\\site-packages\\speechbrain\\pretrained\\fetching.py:135\u001b[0m, in \u001b[0;36mfetch\u001b[1;34m(filename, source, savedir, overwrite, save_filename, use_auth_token, revision)\u001b[0m\n\u001b[0;32m    133\u001b[0m     sourcepath \u001b[38;5;241m=\u001b[39m pathlib\u001b[38;5;241m.\u001b[39mPath(fetched_file)\u001b[38;5;241m.\u001b[39mabsolute()\n\u001b[0;32m    134\u001b[0m     _missing_ok_unlink(destination)\n\u001b[1;32m--> 135\u001b[0m     \u001b[43mdestination\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msymlink_to\u001b[49m\u001b[43m(\u001b[49m\u001b[43msourcepath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m destination\n",
+      "File \u001b[1;32m~\\Anaconda3\\lib\\pathlib.py:1391\u001b[0m, in \u001b[0;36mPath.symlink_to\u001b[1;34m(self, target, target_is_directory)\u001b[0m\n\u001b[0;32m   1389\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_closed:\n\u001b[0;32m   1390\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_closed()\n\u001b[1;32m-> 1391\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_accessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msymlink\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_is_directory\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[1;31mOSError\u001b[0m: [WinError 1314] Le client ne dispose pas d’un privilège nécessaire: 'C:\\\\Users\\\\theo.alvesdacosta\\\\.cache\\\\huggingface\\\\hub\\\\models--speechbrain--spkrec-ecapa-voxceleb\\\\snapshots\\\\5c0be3875fda05e81f3c004ed8c7c06be308de1e\\\\hyperparams.yaml' -> 'C:\\\\Users\\\\theo.alvesdacosta\\\\.cache\\\\torch\\\\pyannote\\\\speechbrain\\\\hyperparams.yaml'"
+     ]
+    }
+   ],
+   "source": [
+    "embedding_model = PretrainedSpeakerEmbedding( \n",
+    "    \"speechbrain/spkrec-ecapa-voxceleb\",\n",
+    "    device=\"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "42b9dc6a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def segment_embedding(segment):\n",
+    "    audio = Audio()\n",
+    "    start = segment[\"start\"]\n",
+    "    # Whisper overshoots the end timestamp in the last segment\n",
+    "    end = min(duration, segment[\"end\"])\n",
+    "    clip = Segment(start, end)\n",
+    "    waveform, sample_rate = audio.crop(audio_file, clip)\n",
+    "    return embedding_model(waveform[None])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "6acb732e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d0408c2dfc644f52832b1978b9967944",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/101 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "NameError",
+     "evalue": "name 'embedding_model' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[26], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros(shape\u001b[38;5;241m=\u001b[39m(\u001b[38;5;28mlen\u001b[39m(segments), \u001b[38;5;241m192\u001b[39m))\n\u001b[0;32m      2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, segment \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(tqdm(segments)):\n\u001b[1;32m----> 3\u001b[0m     embeddings[i] \u001b[38;5;241m=\u001b[39m \u001b[43msegment_embedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43msegment\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m      4\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mnan_to_num(embeddings)\n\u001b[0;32m      5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mEmbedding shape: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00membeddings\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n",
+      "Cell \u001b[1;32mIn[25], line 8\u001b[0m, in \u001b[0;36msegment_embedding\u001b[1;34m(segment)\u001b[0m\n\u001b[0;32m      6\u001b[0m clip \u001b[38;5;241m=\u001b[39m Segment(start, end)\n\u001b[0;32m      7\u001b[0m waveform, sample_rate \u001b[38;5;241m=\u001b[39m audio\u001b[38;5;241m.\u001b[39mcrop(audio_file, clip)\n\u001b[1;32m----> 8\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43membedding_model\u001b[49m(waveform[\u001b[38;5;28;01mNone\u001b[39;00m])\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'embedding_model' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "embeddings = np.zeros(shape=(len(segments), 192))\n",
+    "for i, segment in enumerate(tqdm(segments)):\n",
+    "    embeddings[i] = segment_embedding(segment)\n",
+    "embeddings = np.nan_to_num(embeddings)\n",
+    "print(f'Embedding shape: {embeddings.shape}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "537ea5b1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+git+https://github.com/huggingface/transformers
+git+https://github.com/pyannote/pyannote-audio
+git+https://github.com/openai/whisper.git
+gradio==3.12
+ffmpeg-python
+pandas==1.5.0
+pytube==12.1.0
+sacremoses
+sentencepiece
+tokenizers
+torch
+torchaudio
+tqdm==4.64.1
+EasyNMT==2.0.2
+nltk
+transformers
+pysrt
+psutil==5.9.2
+requests
+gpuinfo
+moviepy