Spaces:

zman1x1
/

yt-video-to-summary

Running

App Files Files Community

zman1x1 committed on Aug 23, 2023

Commit

3456a58

1 Parent(s): 67f56fe

Upload 21 files

Browse files

Files changed (18) hide show

app.py +136 -72
models/__pycache__/lamini.cpython-310.pyc +0 -0
models/distilbart_cnn_12_6.py +3 -1
models/lamini.py +80 -0
models/t5_small_medium_title_generation.py +1 -1
utils/__pycache__/chunk.cpython-310.pyc +0 -0
utils/__pycache__/log.cpython-310.pyc +0 -0
utils/__pycache__/markdown.cpython-310.pyc +0 -0
utils/__pycache__/marp_wrapper.cpython-310.pyc +0 -0
utils/__pycache__/ppt.cpython-310.pyc +0 -0
utils/__pycache__/subtitles.cpython-310.pyc +0 -0
utils/__pycache__/video.cpython-310.pyc +0 -0
utils/chunk.py +97 -0
utils/log.py +37 -0
utils/marp_wrapper.py +8 -1
utils/subtitles.py +0 -2
utils/utils.py +36 -0
utils/video.py +58 -11

app.py CHANGED Viewed

@@ -1,97 +1,161 @@
-# from utils.subtitles import getSubs
-# subs = getSubs("G8gEos8F9R0")
-# print(subs)
-# import json
-# with open("subs.json", "w") as f:
-#     json.dump(subs, f)
 import datetime
 import gradio as gr
-def greet(name):
-    pass
-def run_model(video_id, outname, chunk_size):
     CHUNK_SIZE = chunk_size
-    OUT_PPT_NAME = outname
     from rich.progress import track
-    from utils.subtitles import getSubsText
-    from models.distilbart_cnn_12_6 import summarize
-    from models.t5_small_medium_title_generation import t5model as generate_title
-    from utils.marp_wrapper import marp
     import utils.markdown as md
-    # from utils.chunk import LangChainChunker as chunker
-    from utils.subtitles import subs as chunker
     from utils.ppt import generate_ppt
     from utils.video import video
-    import os
-    # check for marp command
-    if os.system("command -v marp >> /dev/null") != 0:
-        print("Marp not found. Please install marp-cli.")
-        os.system("bash setup.sh")
-    else:
-        print("Marp found.")
-    # Intermediary Markdown file
-    print("Creating Markdown file...")
-    ppt = marp("summary.md")
-    ppt.add_header(
-        theme="uncover",
-        background="",
-        _class="invert",
     )
-    # smaller font size (1.5rem)
-    ppt.add_body("<style> section { font-size: 1.5rem; } </style>")
-    # Generate video
-    vid = video(f"https://youtu.be/{video_id}",
-                f"out/vid-{video_id}")
-    vid.download()
-    # Get the Subtitles from the YouTube video
-    print("Getting subtitles...")
-    chunker_init    = chunker(video_id)
-    chunks          = chunker_init.getSubsList(size=CHUNK_SIZE)
-    chunk_len       = len(chunks)
-    print(f"subtitles divided to {chunk_len} chunks")
-    chunk_num = 1
-    for chunk in track(chunks, description="Processing chunks"):
-        print(f"processing Chunk: {chunk_num}/{chunk_len}")
-        timestamp = str(datetime.timedelta(seconds=chunk[1]))
-        # TODO: better file path
-        img_path  = f"out/vid-{video_id}_{timestamp}.png"
-        summary = summarize(chunk[0])
-        vid.getframe(timestamp)
-        title = generate_title(summary)
-        ppt.add_page( md.h2(title), summary )
-        if os.path.exists(img_path):
-            ppt.add_body(md.image( img_path,
-            align="left", setAsBackground=True, size="contain"))
-        ppt.marp_end()
-        chunk_num += 1
-        continue
-    print(f"Generating {OUT_PPT_NAME}..")
-    ppt.close_file()
-    generate_ppt("summary.md", OUT_PPT_NAME)
-    # return full path to the ppt file
     return os.path.abspath(OUT_PPT_NAME)
-demo = gr.Interface(fn=run_model, inputs=["text", "text", gr.Slider(200, 1000)], outputs="file")
-demo.launch()

+import argparse
 import datetime
+import os
 import gradio as gr
+from signal import SIGINT, signal
+from utils.log import debug, info, logger, breakPoint as bc
+import requests
+from constants import *
+CHUNK_SIZE  =   512
+VIDEO_ID    =   ""
+OUT_PPT_NAME=   PPTX_DEST
+NO_IMAGES   =   False
+QUESTIONS   =   5
def gradio_run(
    video_id, chunk_size: int,
    no_images: bool, no_chapters: bool, out_type="pdf"):
    """Summarize a YouTube video into a Marp slide deck and return its path.

    The video is downloaded, its subtitles are split into chunks, every
    chunk (or chapter) is summarized with the LaMini model and written as
    one slide to the intermediate markdown file ``MD_DEST``, which is then
    converted with ``generate_ppt``.

    Args:
        video_id: YouTube video id; used for the download, the file names
            and the chapter endpoint URL.
        chunk_size: chunk size handed to the subtitle chunkers.
        no_images: when true, no video frame is attached to content slides.
        no_chapters: when true, chunk with ``subs.getSubsList``; otherwise
            chunk per chapter with ``ChunkByChapters``.
        out_type: extension of the generated file. Defaults to "pdf".

    Returns:
        str: absolute path of the generated file.

    Raises:
        gr.Error: when the video has no subtitles.
    """
    # A dropdown without a selection can arrive as None, which would
    # otherwise produce a ".None" file extension.
    out_type = out_type or "pdf"
    # NOTE(review): slider values may arrive as floats -- normalise once here
    chunk_size = int(chunk_size)
    out_name = f"{OUTDIR}/gradio-out{video_id}.{out_type}"

    info("Loading modules..")
    from langchain.chains.summarize import load_summarize_chain
    from langchain.docstore.document import Document
    from rich.progress import track

    import utils.markdown as md
    from models.lamini import lamini as llm_model
    from utils.marp_wrapper import marp
    from utils.ppt import generate_ppt
    from utils.subtitles import subs
    from utils.video import video
    from utils.chunk import ChunkByChapters

    # initialize marp
    out = marp(MD_DEST)
    out.add_header(config=MARP_GAIA)

    # initialize video
    vid = video(video_id, f"{OUTDIR}/vid-{video_id}")
    vid.download()

    # initialize model
    llm = llm_model.load_model(
        max_length=400,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15
    )

    info(f"Getting subtitles {video_id}..")
    raw_subs = vid.getSubtitles()
    if raw_subs is None:
        logger.critical("No subtitles found, exiting..")
        # exit() inside a Gradio callback would tear down the worker instead
        # of reporting the problem to the user.
        raise gr.Error("No subtitles found for this video")
    info(f"got {len(raw_subs)} length subtitles")

    if no_chapters:
        # size based chunking: one slide per chunk
        chunker = subs(video_id)
        chunks = chunker.getSubsList(size=chunk_size)
        model_tmplts = llm_model.templates()
        summarizer = model_tmplts.summarize
        title_gen = model_tmplts.generate_title

        # title slide: video id plus the frame at the first chunk's timestamp
        first_pic = str(datetime.timedelta(seconds=chunks[0][1]))
        img_name = f"vid-{video_id}_{first_pic}.png"
        img_path = f"{PNG_DEST}/{img_name}"
        vid.getframe(first_pic, img_path)
        out.add_page(md.h1(video_id), md.image(url=img_name))
        out.marp_end()

        for chunk in track(chunks, description="(processing chunks) Summarizing.."):
            summary = summarizer(chunk[0])[0]["generated_text"].replace("-", "\n-")
            title = title_gen(chunk[0])[0]["generated_text"]
            # long titles get a smaller heading
            heading = md.h2 if len(title) < 40 else md.h3
            out.add_page(heading(title), summary)
            # long slides skip the image to avoid clipping
            if not no_images and len(summary+title) < 270:
                timestamp = str(datetime.timedelta(seconds=chunk[1]))
                frame_name = f"vid-{video_id}_{timestamp}.png"
                frame_path = f"{PNG_DEST}/{frame_name}"
                vid.getframe(timestamp, frame_path)
                out.add_body(md.image(frame_name, align="left", setAsBackground=True))
            out.marp_end()
    else:
        # chapter based chunking: one slide per chapter
        raw_chapters = vid.getChapters(f"{YT_CHAPTER_ENDPOINT}{video_id}")
        chunk_dict = ChunkByChapters(raw_chapters, raw_subs, chunk_size)
        chain = load_summarize_chain(llm, chain_type="stuff")
        # TODO: ( use refine chain type to summarize all chapters )
        for title, subchunks in track(chunk_dict.items(), description="(processing chunks) Summarizing.."):
            debug(subchunks)
            if not subchunks:
                # a chapter without subtitles has nothing to summarize
                continue
            # every entry of subchunks is [[text], [timestamp]]; wrap each
            # chunk text in a Document for the summarize chain
            docs = [Document(page_content=chunk[0][0]) for chunk in subchunks]
            summary = chain.run(docs)
            # write the page first and the image second, the same order the
            # size based branch uses
            out.add_page(md.h2(title), summary)
            # if summary is long ignore images for better page and no clipping
            if not no_images and len(summary+title) < 270:
                ts = str(datetime.timedelta(seconds=subchunks[0][1][0]))
                img_path = f"{PNG_DEST}/vid-{video_id}_{ts}.png"
                vid.getframe(ts, img_path)
                if os.path.exists(img_path):
                    out.add_body(md.image(
                        img_path.replace(f"{OUTEXTRA}/", ""),
                        align="left",
                        setAsBackground=True
                    ))
            out.marp_end()

    info(f"Generating {out_name}..")
    out.close_file()
    generate_ppt(MD_DEST, out_name)
    print(f"Done! {out_name}")
    return os.path.abspath(out_name)
def gradio_Interface():
    """Build the Gradio UI around ``gradio_run`` and launch it.

    Inputs, in the order ``gradio_run`` expects them: video id, chunk size,
    "no images" flag, "no chapters" flag and output file format. The output
    is the generated file.
    """
    app = gr.Interface(
        fn=gradio_run,
        inputs=[
            "text",
            # start at the module default instead of a chunk size of 1
            gr.Slider(1, 2000, value=CHUNK_SIZE, step=1, label="Chunk Size", info="More chunk size = longer text & smaller number of slides"),
            gr.Checkbox(label="No Images", info="Don't keep images in output ( gives more spaces for larger text)"),
            gr.Checkbox(label="No Chapters", info="Don't use chapter based chunking"),
            # preselect "pdf" (gradio_run's default) so the callback never
            # receives an empty selection
            gr.Dropdown(["pptx", "pdf", "html"], value="pdf", label="file format", info="which file format to generate.")
        ],
        outputs="file"
    )
    app.launch()
+if __name__ == "__main__":
+    logger.info("Starting gradio interface..")
+    gradio_Interface()

models/__pycache__/lamini.cpython-310.pyc ADDED Viewed

Binary file (2.14 kB). View file

models/distilbart_cnn_12_6.py CHANGED Viewed

@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 # loading the model outside of the function makes it faster
 SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"
 tokenizer   = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
-model       = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
 def summarize(text, max_len=20):
     """
@@ -24,6 +24,8 @@ def summarize(text, max_len=20):
                        truncation=True,
     ).input_ids
     outputs = model.generate(inputs,
                             max_new_tokens=100,

 # loading the model outside of the function makes it faster
 SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"
 tokenizer   = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
+model       = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL, device_map="cuda:0")
 def summarize(text, max_len=20):
     """
                        truncation=True,
     ).input_ids
+    # Move the inputs tensor to the same device as the model tensor
+    inputs = inputs.cuda()
     outputs = model.generate(inputs,
                             max_new_tokens=100,

models/lamini.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# Load model directly
+from langchain.llms import HuggingFacePipeline
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    pipeline,
+    GenerationConfig
+)
import torch

model_id = "MBZUAI/LaMini-Flan-T5-248M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Use the first GPU when there is one; fall back to the CPU so that importing
# this module does not fail on machines without CUDA.
_device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=_device)
gen_config = GenerationConfig.from_pretrained(model_id)
class lamini:
    """Helpers built on the module-level LaMini-Flan-T5 model and tokenizer."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def load_model(
        task="text2text-generation",
        **kwargs
    ):
        """Return a LangChain LLM wrapping a transformers pipeline.

        - model: MBZUAI/LaMini-Flan-T5-248M

        Declared as a static method because it is called on the class
        itself and never uses an instance.

        Args:
            task (str): pipeline task name.
            **kwargs: optional generation settings. Recognised keys and
                their defaults: ``max_length`` (512), ``temperature`` (0),
                ``top_p`` (0.95), ``repetition_penalty`` (1.15). Other keys
                are ignored.

        Returns:
            HuggingFacePipeline: the pipeline wrapped for LangChain.
        """
        pipe = pipeline(
            task,
            model=model,
            tokenizer=tokenizer,
            generation_config=gen_config,
            max_length=kwargs.get("max_length", 512),
            top_p=kwargs.get("top_p", 0.95),
            temperature=kwargs.get("temperature", 0),
            repetition_penalty=kwargs.get("repetition_penalty", 1.15),
        )
        return HuggingFacePipeline(pipeline=pipe)

    class templates:
        """Prompt templates that call the model through a plain pipeline."""

        # Pipelines keyed by max_length. Shared on purpose so a pipeline is
        # built once instead of on every summarize/generate_title call.
        _pipes = {}

        @staticmethod
        def _pipe_for(max_length):
            """Return the cached text2text pipeline for ``max_length``."""
            cache = lamini.templates._pipes
            if max_length not in cache:
                cache[max_length] = pipeline(
                    "text2text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_length=max_length,
                    generation_config=gen_config,
                    temperature=0,
                    top_p=0.95,
                    repetition_penalty=1.15
                )
            return cache[max_length]

        def summarize(self, text):
            """Run the summarization prompt on ``text``; returns the raw pipeline output."""
            instructions = "summarize for better understanding: "
            return lamini.templates._pipe_for(400)(instructions + text)

        def generate_title(self, text):
            """Run the title prompt on ``text``; returns the raw pipeline output."""
            instructions = "generate a perfect title for the following text in 6 words: "
            return lamini.templates._pipe_for(60)(instructions + text)

models/t5_small_medium_title_generation.py CHANGED Viewed

@@ -4,7 +4,7 @@ import torch
 def t5model(prompt: str) -> str:
     tokenizer = AutoTokenizer.from_pretrained("fabiochiu/t5-small-medium-title-generation")
-    model = AutoModelForSeq2SeqLM.from_pretrained("fabiochiu/t5-small-medium-title-generation", torch_dtype=torch.float32)
     inputs = tokenizer(
         ["summarize:" + prompt],
         return_tensors="pt",

 def t5model(prompt: str) -> str:
     tokenizer = AutoTokenizer.from_pretrained("fabiochiu/t5-small-medium-title-generation")
+    model = AutoModelForSeq2SeqLM.from_pretrained("fabiochiu/t5-small-medium-title-generation", device_map="cuda:0", torch_dtype=torch.float16)
     inputs = tokenizer(
         ["summarize:" + prompt],
         return_tensors="pt",

utils/__pycache__/chunk.cpython-310.pyc ADDED Viewed

Binary file (3.18 kB). View file

utils/__pycache__/log.cpython-310.pyc ADDED Viewed

Binary file (1.14 kB). View file

utils/__pycache__/markdown.cpython-310.pyc ADDED Viewed

Binary file (1.87 kB). View file

utils/__pycache__/marp_wrapper.cpython-310.pyc ADDED Viewed

Binary file (2.34 kB). View file

utils/__pycache__/ppt.cpython-310.pyc ADDED Viewed

Binary file (665 Bytes). View file

utils/__pycache__/subtitles.cpython-310.pyc ADDED Viewed

Binary file (2.38 kB). View file

utils/__pycache__/video.cpython-310.pyc ADDED Viewed

Binary file (2.95 kB). View file

utils/chunk.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # divide the subs into chunks for more accurate summarization
 # TODO: divide the subs into chunks based on the topics
 # summarize each chunk and add it to the markdown file
 class legacy_chunker:
     # legacy manual chunker
@@ -50,3 +51,99 @@ class LangChainChunker:
         for _ in self.text:
             count += 1
         return count

 # divide the subs into chunks for more accurate summarization
 # TODO: divide the subs into chunks based on the topics
 # summarize each chunk and add it to the markdown file
+from rich.progress import track
 class legacy_chunker:
     # legacy manual chunker
         for _ in self.text:
             count += 1
         return count
def _chunk_chapter_subs(subs: list, start, end, size: int) -> list:
    """Chunk the subtitle lines whose start time lies in ``[start, end)``."""
    chunks = []
    parts = []       # texts of the chunk being built
    length = 0       # length of " ".join(parts)
    last_start = None
    for line in subs:
        line_start = line['start']
        if line_start < start:
            continue
        if line_start >= end:
            # relies on subs being ordered by start time
            break
        text = line['text']
        if parts and length + len(text) + 1 >= size:
            # the line does not fit: close the chunk and open the next one
            # with this line instead of dropping it
            chunks.append([[" ".join(parts).strip()], [line_start]])
            parts, length = [], 0
        length += len(text) + (1 if parts else 0)
        parts.append(text)
        last_start = line_start
    if parts:
        # flush what is left when the chapter ends
        chunks.append([[" ".join(parts).strip()], [last_start]])
    return chunks


def ChunkByChapters(chapters: list, subs: list, size=1000):
    """Chunk the youtube video subtitles based on the chapters

    Every chapter covers the subtitle lines from its own start time up to
    (not including) the start time of the next chapter; the first chapter
    starts at 0 and the last one runs to the end of the subtitles. Inside a
    chapter the lines are joined with a space into chunks shorter than
    ``size`` characters (a single line longer than that gets a chunk of its
    own).

    Args:
        chapters (list): Chapters from yt api, dicts with 'title' and 'time'
        subs (list): subtitles from yt api, dicts with 'start' and 'text',
            ordered by start time
        size (int, optional): chunk size limit in characters. Defaults to 1000.

    Raises:
        Exception: No chapters found

    Returns:
        dict : structure chunk_dict = {
              "chapter1": [
                  [[chunk1_text], [chunk1_timestamp]],
                  [[chunk2_text], [chunk2_timestamp]],
                  ...
              ],
              ...
          }
          A chunk's timestamp is the start time of the line that closed it
          (the first line that no longer fit), or of its own last line for
          the final chunk of a chapter.
    """
    if not chapters:
        raise Exception("No chapters found")

    # the progress bar is cosmetic: iterate plainly when rich is unavailable
    try:
        from rich.progress import track
    except ImportError:
        def track(sequence, description=""):
            return sequence

    starts = [chapter['time'] for chapter in chapters]
    chunk_dict = {}
    for index in track(
        range(len(chapters)),
        description="Chunking by chapters: "
    ):
        title = chapters[index]['title']
        # the first chapter also takes anything before its own timestamp
        start = 0 if index == 0 else starts[index]
        # the last chapter has no successor and runs to the end
        end = starts[index + 1] if index + 1 < len(starts) else float("inf")
        chunk_dict.update({title: _chunk_chapter_subs(subs, start, end, size)})
    return chunk_dict

utils/log.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import logging
+from rich.logging import RichHandler
+FORMAT = "%(message)s"
+logging.basicConfig(
+    level="NOTSET",
+    format=FORMAT,
+    datefmt="[%X]",
+    handlers=[RichHandler(rich_tracebacks=True)],
+)
+logger = logging.getLogger("project-v-p")
def warn(msg):
    """Emit ``msg`` on the project logger at WARNING level."""
    logger.log(logging.WARNING, msg)


def info(msg):
    """Emit ``msg`` on the project logger at INFO level."""
    logger.log(logging.INFO, msg)


def debug(msg):
    """Emit ``msg`` on the project logger at DEBUG level."""
    logger.log(logging.DEBUG, msg)
def breakPoint():
    """Pause the program for debug or analysis until the user answers.

    Prints a prompt and reads one line from stdin:
    - "y" / "Y" or an empty line: return and let the program continue
    - anything else: log "Exiting..." and terminate with SystemExit
    """
    print("Breakpoint: Press Y to continue, N to exit")
    user_input = input()
    if user_input.lower() == "y" or user_input == "":
        return
    logger.info("Exiting...")
    # raise SystemExit directly: the exit() helper is injected by the site
    # module and is not available in every interpreter setup
    raise SystemExit

utils/marp_wrapper.py CHANGED Viewed

@@ -16,7 +16,8 @@ class marp:
         paginate: bool = True,
         background: str = "",
         backgroundImage: str = None,
-        extra_styles: str = None
     ):
         ## write the header
         # ---
@@ -26,6 +27,12 @@ class marp:
         # backgroundColor: #fff
         # backgroundImage: url('https://marp.app/assets/hero-background.svg')
         # ---
         self.marp_write("---\n")
         self.marp_write("marp: true\n")
         self.marp_write(f"theme: {theme}\n")

         paginate: bool = True,
         background: str = "",
         backgroundImage: str = None,
+        extra_styles: str = None,
+        config: dict = None
     ):
         ## write the header
         # ---
         # backgroundColor: #fff
         # backgroundImage: url('https://marp.app/assets/hero-background.svg')
         # ---
+        if config is not None:
+            theme = config["theme"]
+            background = config["background"]
+            _class = config["class"]
         self.marp_write("---\n")
         self.marp_write("marp: true\n")
         self.marp_write(f"theme: {theme}\n")

utils/subtitles.py CHANGED Viewed

@@ -53,8 +53,6 @@ class subs:
         chunks = []
         current_chunk = "" # limited to {size}
         current_duaration = 0  # TODO: add better variable name
-        c_d_target = 2
-        c_d_count = 0
         for subline in subs:
             current_duaration = subline["start"]

         chunks = []
         current_chunk = "" # limited to {size}
         current_duaration = 0  # TODO: add better variable name
         for subline in subs:
             current_duaration = subline["start"]

utils/utils.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import subprocess, os
def Popen(cmd: list) -> str:
    """Run a command and return the output as a string

    - example: print(Popen(["ls", "-l"]))

    The command runs without a shell. subprocess.run waits for the child
    and closes the pipe, so no process or file handle is left behind.
    The exit status is not checked.

    Args:
        cmd (list): The command to run

    Returns:
        str: The stdout of the command, stripped and decoded as utf-8
    """
    completed = subprocess.run(
        cmd, shell=False, stdout=subprocess.PIPE, check=False
    )
    return completed.stdout.strip().decode('utf-8')
def getfilesR(path: str, sorted=False) -> list:
    """Collect the paths of all files below a directory, at any depth

    Args:
        path (str): Directory to walk. "." for current directory
        sorted (bool, optional): Sort the resulting paths. Defaults to False.

    Returns:
        list: File paths, each joined with the directory it was found in
    """
    found = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]
    if sorted:
        found.sort()
    return found

utils/video.py CHANGED Viewed

@@ -1,4 +1,6 @@
-import subprocess, os
 def Popen(cmd: list) -> str:
     """Run a command and return the output as a string
@@ -12,9 +14,10 @@ def Popen(cmd: list) -> str:
     return subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE).stdout.read().strip().decode('utf-8')
 class video:
-    def __init__(self,url, path):
         self.path = path
-        self.url = url
         # check if directory exists
         if not os.path.exists(self.path.split("/")[-1]):
@@ -22,9 +25,9 @@ class video:
     def download(self):
         if os.path.exists(f"{self.path}.webm"):
-            print(f"{self.path}.webm already exists, skipping download")
             return
-        print(f"Downloading {self.url}")
         # (
         #     Popen(
         #             ["yt-dlp", self.url, "-o", self.path ]
@@ -32,14 +35,13 @@ class video:
         # )
         os.system(f"yt-dlp {self.url} -o {self.path}")
-    def getframe(self, timestamp):
-        filename = f"{self.path}_{timestamp}.png"
         if os.path.exists(filename):
-            print(f"{filename} already exists, skipping download")
             return
-        print(f"Getting frame at {timestamp}")
         (
             Popen(
                 [
@@ -49,7 +51,52 @@ class video:
                     "-ss", timestamp,
                     "-i", f"{self.path}.webm",
                     "-vframes", "1",
-                    f"{filename}"
                 ]
             )
         )

+import subprocess, os, requests, json
+from utils.log import info
+from utils.subtitles import subs
 def Popen(cmd: list) -> str:
     """Run a command and return the output as a string
     return subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE).stdout.read().strip().decode('utf-8')
 class video:
+    def __init__(self,id, path):
         self.path = path
+        self.url = "https://youtu.be/" + id
+        self.video_id = id
         # check if directory exists
         if not os.path.exists(self.path.split("/")[-1]):
     def download(self):
         if os.path.exists(f"{self.path}.webm"):
+            info(f"{self.path}.webm already exists, skipping download")
             return
+        info(f"Downloading {self.url}")
         # (
         #     Popen(
         #             ["yt-dlp", self.url, "-o", self.path ]
         # )
         os.system(f"yt-dlp {self.url} -o {self.path}")
+    def getframe(self, timestamp, out=os.curdir):
+        filename = out
         if os.path.exists(filename):
+            info(f"{filename} already exists, skipping frame")
             return
+        info(f"Getting frame at {timestamp}")
         (
             Popen(
                 [
                     "-ss", timestamp,
                     "-i", f"{self.path}.webm",
                     "-vframes", "1",
+                    filename
+                ]
+            )
+        )
+    def getAudio(self, out="out.mp3"):
+        info("Getting audio...")
+        (
+            Popen(
+                [
+                    "ffmpeg",
+                    "-hide_banner",
+                    "-loglevel", "panic",
+                    "-i", f"{self.path}.webm",
+                    "-vn",
+                    "-ar", "44100",
+                    "-ac", "2",
+                    "-ab", "192K",
+                    "-f", "mp3",
+                    out
                 ]
             )
         )
+    def getChapters(self, endpoint: str) -> list:
+        """return the chapters of the video
+        Args:
+            endpoint (str): endpoint to communicate to get chapters
+                            yt.lemnoslife.com recommended
+        Returns:
+            list: chapters
+        """
+        res = requests.get(f"{endpoint}")
+        chapters = res.json()['items'][0]['chapters']['chapters']
+        return chapters
+    def getSubtitles(self):
+        """return the raw subtitles
+        Returns:
+            list: subtitles directly from youtube
+        """
+        return json.loads(
+            json.dumps(
+                subs(self.video_id)
+                .getSubsRaw()
+            )
+        )