zman1x1 committed
Commit 3456a58 · 1 Parent(s): 67f56fe

Upload 21 files

app.py CHANGED
@@ -1,97 +1,161 @@
-# from utils.subtitles import getSubs
-
-# subs = getSubs("G8gEos8F9R0")
-# print(subs)
-
-# import json
-
-# with open("subs.json", "w") as f:
-#     json.dump(subs, f)
-
+import argparse
 import datetime
+import os
 import gradio as gr
+from signal import SIGINT, signal
+from utils.log import debug, info, logger, breakPoint as bc
 
-def greet(name):
-    pass
+import requests
 
-def run_model(video_id, outname, chunk_size):
+from constants import *
+
+CHUNK_SIZE = 512
+VIDEO_ID = ""
+OUT_PPT_NAME = PPTX_DEST
+NO_IMAGES = False
+QUESTIONS = 5
+
+def gradio_run(
+        video_id, chunk_size: int,
+        no_images: bool, no_chapters: bool, out_type="pdf"):
+
+    VIDEO_ID = video_id
     CHUNK_SIZE = chunk_size
-    OUT_PPT_NAME = outname
+    NO_IMAGES = no_images
+    NO_CHAPTERS = no_chapters
+    OUT_PPT_NAME = f"{OUTDIR}/gradio-out{VIDEO_ID}.{out_type}"
+
+    info("Loading modules..")
+    from langchain.chains.summarize import load_summarize_chain
+    # from langchain.vectorstores import Chroma
+    # from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+    # from langchain.chains import RetrievalQA
+    # from langchain.llms import HuggingFacePipeline
+    from langchain.docstore.document import Document
     from rich.progress import track
-    from utils.subtitles import getSubsText
-    from models.distilbart_cnn_12_6 import summarize
-    from models.t5_small_medium_title_generation import t5model as generate_title
-    from utils.marp_wrapper import marp
+
     import utils.markdown as md
-    # from utils.chunk import LangChainChunker as chunker
-    from utils.subtitles import subs as chunker
+    from models.lamini import lamini as model
+    from utils.marp_wrapper import marp
     from utils.ppt import generate_ppt
+    from utils.subtitles import subs
     from utils.video import video
-    import os
-
-    # check for marp command
-    if os.system("command -v marp >> /dev/null") != 0:
-        print("Marp not found. Please install marp-cli.")
-        os.system("bash setup.sh")
-    else:
-        print("Marp found.")
-
-    # Intermediary Markdown file
-    print("Creating Markdown file...")
-    ppt = marp("summary.md")
-    ppt.add_header(
-        theme="uncover",
-        background="",
-        _class="invert",
+    from utils.chunk import ChunkByChapters
+
+    # initialize marp
+    out = marp(MD_DEST)
+    out.add_header(config=MARP_GAIA)
+    # out.add_body("<style> section { font-size: 1.5rem; } </style>")
+
+    # initialize video
+    vid = video(VIDEO_ID, f"{OUTDIR}/vid-{VIDEO_ID}")
+    vid.download()
+
+    # initialize model
+    llm_model = model
+    llm = llm_model.load_model(
+        max_length=400,
+        temperature=0,
+        top_p=0.95,
+        repetition_penalty=1.15
     )
 
-    # smaller font size (1.5rem)
-    ppt.add_body("<style> section { font-size: 1.5rem; } </style>")
-
-    # Generate video
-    vid = video(f"https://youtu.be/{video_id}",
-                f"out/vid-{video_id}")
-    vid.download()
-
-    # Get the Subtitles from the YouTube video
-    print("Getting subtitles...")
-
-    chunker_init = chunker(video_id)
-    chunks = chunker_init.getSubsList(size=CHUNK_SIZE)
-    chunk_len = len(chunks)
-
-    print(f"subtitles divided to {chunk_len} chunks")
-
-    chunk_num = 1
-    for chunk in track(chunks, description="Processing chunks"):
-        print(f"processing Chunk: {chunk_num}/{chunk_len}")
-        timestamp = str(datetime.timedelta(seconds=chunk[1]))
-        # TODO: better file path
-        img_path = f"out/vid-{video_id}_{timestamp}.png"
-
-        summary = summarize(chunk[0])
-        vid.getframe(timestamp)
-        title = generate_title(summary)
-
-        ppt.add_page( md.h2(title), summary )
-
-        if os.path.exists(img_path):
-            ppt.add_body(md.image( img_path,
-                align="left", setAsBackground=True, size="contain"))
-
-        ppt.marp_end()
-        chunk_num += 1
-        continue
-
-    print(f"Generating {OUT_PPT_NAME}..")
-    ppt.close_file()
-    generate_ppt("summary.md", OUT_PPT_NAME)
-
-    # return full path to the ppt file
+    # slice the subtitles and chunk them
+    # to CHUNK_SIZE, based on chapters
+    info(f"Getting subtitles {VIDEO_ID}..")
+    raw_subs = vid.getSubtitles()
+
+    if raw_subs is None:
+        logger.critical("No subtitles found, exiting..")
+        exit()
+
+    info(f"got subtitles of length {len(raw_subs)}")
+
+    if NO_CHAPTERS:
+        chunker = subs(VIDEO_ID)
+        chunks = chunker.getSubsList(size=CHUNK_SIZE)
+        model_tmplts = llm_model.templates()
+        summarizer = model_tmplts.summarize
+        title_gen = model_tmplts.generate_title
+
+        # title photo
+        first_pic = str(datetime.timedelta(seconds=chunks[0][1]))
+        img_name = f"vid-{VIDEO_ID}_{first_pic}.png"
+        img_path = f"{PNG_DEST}/{img_name}"
+        vid.getframe(first_pic, img_path)
+        out.add_page(md.h1(VIDEO_ID), md.image(url=img_name))
+        out.marp_end()
+
+        for chunk in track(chunks, description="(processing chunks) Summarizing.."):
+            summary = summarizer(chunk[0])[0]["generated_text"].replace("-", "\n-")
+            title = title_gen(chunk[0])[0]["generated_text"]
+
+            heading = md.h2 if len(title) < 40 else md.h3
+            out.add_page(heading(title), summary)
+
+            if not NO_IMAGES and len(summary + title) < 270:
+                timestamp = str(datetime.timedelta(seconds=chunk[1]))
+                imgName = f"vid-{VIDEO_ID}_{timestamp}.png"
+                imgPath = f"{PNG_DEST}/{imgName}"
+                vid.getframe(timestamp, imgPath)
+                out.add_body(md.image(imgName, align="left", setAsBackground=True))
+
+            out.marp_end()
+    else:
+        raw_chapters = vid.getChapters(f"{YT_CHAPTER_ENDPOINT}{VIDEO_ID}")
+        chunk_dict = ChunkByChapters(raw_chapters, raw_subs, CHUNK_SIZE)
+        chain = load_summarize_chain(llm, chain_type="stuff")
+        # TODO: use refine chain type to summarize all chapters
+        img_hook = False
+        for title, subchunks in track(chunk_dict.items(), description="(processing chunks) Summarizing.."):
+            # typecast subchunks to Document for every topic,
+            # get a summary for every topic with the stuff/refine chain,
+            # and add it to the final summary
+            debug(subchunks)
+            docs = [ Document(page_content=t[0]) for t in subchunks[0] ]
+            summary = chain.run(docs)
+
+            if img_hook == False:
+                ts = str(datetime.timedelta(seconds=subchunks[0][1][0]))
+                img_path = f"{PNG_DEST}/vid-{VIDEO_ID}_{ts}.png"
+                vid.getframe(ts, img_path)
+                if os.path.exists(img_path):
+                    # if the summary is long, skip the image so the page fits without clipping
+                    if len(summary + title) < 270:
+                        out.add_body(md.image(
+                            img_path.replace(f"{OUTEXTRA}/", ""),
+                            align="left",
+                            setAsBackground=True
+                        ))
+            out.add_page(md.h2(title), summary)
+            out.marp_end()
+
+    info(f"Generating {OUT_PPT_NAME}..")
+    out.close_file()
+    generate_ppt(MD_DEST, OUT_PPT_NAME)
+    print(f"Done! {OUT_PPT_NAME}")
 
     return os.path.abspath(OUT_PPT_NAME)
 
-
-demo = gr.Interface(fn=run_model, inputs=["text", "text", gr.Slider(200, 1000)], outputs="file")
-
-demo.launch()
+def gradio_Interface():
+    app = gr.Interface(
+        fn=gradio_run,
+        inputs=[
+            "text",
+            gr.Slider(1, 2000, 1, label="Chunk Size", info="Larger chunk size = longer text & fewer slides"),
+            gr.Checkbox(label="No Images", info="Don't keep images in the output (gives more space for larger text)"),
+            gr.Checkbox(label="No Chapters", info="Don't use chapter-based chunking"),
+            gr.Dropdown(["pptx", "pdf", "html"], label="File format", info="Which file format to generate.")
+        ],
+        outputs="file"
+    )
+    app.launch()
+
+if __name__ == "__main__":
+    logger.info("Starting gradio interface..")
+    gradio_Interface()
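
For reference, the rewritten app.py resolves OUTDIR, OUTEXTRA, MD_DEST, PPTX_DEST, PNG_DEST, YT_CHAPTER_ENDPOINT, and MARP_GAIA via `from constants import *`; constants.py itself is not visible in this diff. A plausible minimal sketch, with every value an assumption for illustration only:

# constants.py -- hypothetical sketch; the names come from the diff above,
# the values are assumed
OUTDIR = "out"
OUTEXTRA = f"{OUTDIR}/extra"
MD_DEST = f"{OUTDIR}/summary.md"
PPTX_DEST = f"{OUTDIR}/summary.pptx"
PNG_DEST = OUTEXTRA
# chapter endpoint; utils/video.py recommends yt.lemnoslife.com
YT_CHAPTER_ENDPOINT = "https://yt.lemnoslife.com/videos?part=chapters&id="
# header config consumed by marp.add_header(config=...); keys match what it reads
MARP_GAIA = {"theme": "gaia", "background": "", "class": "invert"}
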
models/__pycache__/lamini.cpython-310.pyc ADDED
Binary file (2.14 kB).
 
models/distilbart_cnn_12_6.py CHANGED
@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 # loading the model outside of the function makes it faster
 SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"
 tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
-model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
+model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL, device_map="cuda:0")
 
 def summarize(text, max_len=20):
     """
@@ -24,6 +24,8 @@ def summarize(text, max_len=20):
         truncation=True,
     ).input_ids
 
+    # Move the inputs tensor to the same device as the model tensor
+    inputs = inputs.cuda()
 
     outputs = model.generate(inputs,
                              max_new_tokens=100,
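
Note that `inputs.cuda()` assumes a CUDA device is present and will raise on a CPU-only machine. A device-agnostic alternative (a sketch, not what this commit does) moves the tensor to wherever the model was placed:

# follow the model's device instead of hard-coding CUDA
inputs = inputs.to(model.device)
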
models/lamini.py ADDED
@@ -0,0 +1,80 @@
+# Load model directly
+from langchain.llms import HuggingFacePipeline
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    pipeline,
+    GenerationConfig
+)
+
+model_id = "MBZUAI/LaMini-Flan-T5-248M"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="cuda:0")
+gen_config = GenerationConfig.from_pretrained(model_id)
+
+class lamini:
+    def __init__(self) -> None:
+        pass
+
+    def load_model(
+        task="text2text-generation",
+        **kwargs
+    ):
+        """Returns a pipeline for the model
+        - model: MBZUAI/LaMini-Flan-T5-248M
+
+        Returns:
+            HuggingFacePipeline: the LangChain-wrapped text2text-generation pipeline
+        """
+
+        max_length = kwargs.get("max_length", 512)
+        temperature = kwargs.get("temperature", 0)
+        top_p = kwargs.get("top_p", 0.95)
+        repetition_penalty = kwargs.get("repetition_penalty", 1.15)
+
+        pipe = pipeline(
+            "text2text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            generation_config=gen_config,
+            max_length=max_length,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+        )
+
+        llm = HuggingFacePipeline(pipeline=pipe)
+        return llm
+
+    class templates:
+
+        def summarize(self, text):
+            instructions = "summarize for better understanding: "
+            pipe = pipeline(
+                "text2text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                max_length=400,
+                generation_config=gen_config,
+                temperature=0,
+                top_p=0.95,
+                repetition_penalty=1.15
+            )
+            return pipe(instructions + text)
+
+        def generate_title(self, text):
+            instructions = "generate a perfect title for the following text in 6 words: "
+
+            pipe = pipeline(
+                "text2text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                max_length=60,
+                generation_config=gen_config,
+                temperature=0,
+                top_p=0.95,
+                repetition_penalty=1.15
+            )
+
+            return pipe(instructions + text)
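
For context, this is how app.py drives the new module; a short usage sketch (the chunk text is an assumed placeholder):

from models.lamini import lamini

# load_model takes no self, so it is called on the class itself, as app.py does
llm = lamini.load_model(max_length=400, temperature=0,
                        top_p=0.95, repetition_penalty=1.15)

tmpl = lamini.templates()
chunk = "transcript text for one chunk ..."  # assumed input
summary = tmpl.summarize(chunk)[0]["generated_text"]
title = tmpl.generate_title(chunk)[0]["generated_text"]
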
models/t5_small_medium_title_generation.py CHANGED
@@ -4,7 +4,7 @@ import torch
 
 def t5model(prompt: str) -> str:
     tokenizer = AutoTokenizer.from_pretrained("fabiochiu/t5-small-medium-title-generation")
-    model = AutoModelForSeq2SeqLM.from_pretrained("fabiochiu/t5-small-medium-title-generation", torch_dtype=torch.float32)
+    model = AutoModelForSeq2SeqLM.from_pretrained("fabiochiu/t5-small-medium-title-generation", device_map="cuda:0", torch_dtype=torch.float16)
     inputs = tokenizer(
         ["summarize:" + prompt],
         return_tensors="pt",
utils/__pycache__/chunk.cpython-310.pyc ADDED
Binary file (3.18 kB).

utils/__pycache__/log.cpython-310.pyc ADDED
Binary file (1.14 kB).

utils/__pycache__/markdown.cpython-310.pyc ADDED
Binary file (1.87 kB).

utils/__pycache__/marp_wrapper.cpython-310.pyc ADDED
Binary file (2.34 kB).

utils/__pycache__/ppt.cpython-310.pyc ADDED
Binary file (665 Bytes).

utils/__pycache__/subtitles.cpython-310.pyc ADDED
Binary file (2.38 kB).

utils/__pycache__/video.cpython-310.pyc ADDED
Binary file (2.95 kB).
 
utils/chunk.py CHANGED
@@ -1,6 +1,7 @@
 # divide the subs into chunks for more accurate summarization
 # TODO: divide the subs into chunks based on the topics
 # summarize each chunk and add it to the markdown file
+from rich.progress import track
 
 class legacy_chunker:
     # legacy manual chunker
@@ -50,3 +51,99 @@ class LangChainChunker:
         for _ in self.text:
             count += 1
         return count
+
+def ChunkByChapters(chapters: list, subs: list, size=1000):
+    """Chunk the youtube video subtitles based on the chapters
+
+    Args:
+        chapters (list): chapters from the yt api
+        subs (list): subtitles from the yt api
+        size (int, optional): target chunk size in characters. Defaults to 1000.
+
+    Raises:
+        Exception: No chapters found
+
+    Returns:
+        dict: structure chunk_dict = {
+            "chapter1": [
+                [chunk1, chunk2, chunk3, ...],
+                [chunk1_duration, chunk2_duration, chunk3_duration, ...]
+            ],
+            ...
+        }
+    """
+    chunks = []
+    chunk_dict = {}
+
+    # format chapters for chunking
+    Fchapters = [[chapter['title'], chapter['time']] for chapter in chapters]
+
+    if len(chapters) == 0:
+        raise Exception("No chapters found")
+    else:
+
+        ## STEP 1:
+        # chapter timestamps mark the beginning of each chapter;
+        # rather than always checking whether a sub falls inside a chapter,
+        # it is easier to set each timestamp to the end of its chapter,
+        # i.e. the last second before the next chapter starts
+        for c in range(len(Fchapters)-1):
+            if c == len(Fchapters):
+                break
+            Fchapters[c][1] = Fchapters[c+1][1] - 1
+
+        ## STEP 2: chunking based on chapters
+        # for each chapter, chunk the subs
+        # and add the chunk to the chunk_dict
+        #
+        # chunk_dict = {
+        #     "chapter1": [
+        #         [chunk1, chunk2, chunk3, ...],
+        #         [chunk1_duration, chunk2_duration, chunk3_duration, ...]
+        #     ],
+        #     ...
+        # }
+
+        for c in track(
+            range(len(Fchapters)-1),
+            description="Chunking by chapters: "
+        ):
+            title = Fchapters[c][0]
+
+            # set the start and end of the chapter
+            start = 0 if c == 0 else Fchapters[c-1][1]+1
+            end = Fchapters[c][1]
+
+            current_chunk = ""
+
+            ## STEP 2 (a): process the subs
+            # for each sub, check if it is in the chapter;
+            # if it is, add it to the current chunk
+            for sublinedata in subs:
+                cstart: int = sublinedata['start']
+                subline: str = sublinedata['text']
+
+                if cstart < start:
+                    continue
+                if cstart >= end:
+                    break
+
+                total_size = len(current_chunk) + len(subline)
+                if total_size + 1 < size:
+                    current_chunk += subline
+                else:
+                    chunks.append(
+                        [
+                            [current_chunk.strip()],
+                            [cstart],
+                        ]
+                    )
+                    current_chunk = ""
+
+            chunk_dict.update({title: chunks})
+            chunks = []
+
+    return chunk_dict
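
A usage sketch under the shapes the function actually reads (chapter dicts with 'title'/'time', subtitle dicts with 'start'/'text'; the values here are assumed):

chapters = [
    {"title": "Intro", "time": 0},
    {"title": "Setup", "time": 95},
    {"title": "Demo", "time": 310},
]
subs = [
    {"start": 1.2, "text": "hello and welcome "},
    {"start": 97.0, "text": "first install the cli "},
]
# maps each chapter title (except the last) to its chunks; note that a
# chunk is only flushed once `size` is exceeded, so short trailing text
# inside a chapter is not emitted
chunk_dict = ChunkByChapters(chapters, subs, size=1000)
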
utils/log.py ADDED
@@ -0,0 +1,37 @@
+import logging
+from rich.logging import RichHandler
+
+FORMAT = "%(message)s"
+logging.basicConfig(
+    level="NOTSET",
+    format=FORMAT,
+    datefmt="[%X]",
+    handlers=[RichHandler(rich_tracebacks=True)],
+)
+
+logger = logging.getLogger("project-v-p")
+
+def warn(msg):
+    logger.warning(msg)
+
+def info(msg):
+    logger.info(msg)
+
+def debug(msg):
+    logger.debug(msg)
+
+def breakPoint():
+    """A quick function to pause the program for debug or analysis
+
+    take user input Y/N
+    if Y: continue
+    else: exit"""
+
+    print("Breakpoint: Press Y to continue, N to exit")
+    user_input = input()
+    if user_input.lower() == "y" or user_input == "":
+        pass
+    else:
+        logger.info("Exiting...")
+        exit()
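
Callers import the module-level helpers directly, e.g.:

from utils.log import info, debug, breakPoint

info("Loading modules..")   # rendered through the RichHandler
debug(chunks)               # assumes a `chunks` value to inspect
breakPoint()                # interactive pause; anything but y/Enter exits
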
utils/marp_wrapper.py CHANGED
@@ -16,7 +16,8 @@ class marp:
         paginate: bool = True,
         background: str = "",
         backgroundImage: str = None,
-        extra_styles: str = None
+        extra_styles: str = None,
+        config: dict = None
     ):
         ## write the header
         # ---
@@ -26,6 +27,12 @@ class marp:
         # backgroundColor: #fff
         # backgroundImage: url('https://marp.app/assets/hero-background.svg')
         # ---
+
+        if config is not None:
+            theme = config["theme"]
+            background = config["background"]
+            _class = config["class"]
+
         self.marp_write("---\n")
         self.marp_write("marp: true\n")
         self.marp_write(f"theme: {theme}\n")
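
Note that a config dict passed to add_header must supply all three keys ("theme", "background", "class") or the lookup raises KeyError. A more forgiving variant (an alternative sketch, not what this commit does) would fall back to the keyword arguments:

if config is not None:
    theme = config.get("theme", theme)
    background = config.get("background", background)
    _class = config.get("class", _class)
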
utils/subtitles.py CHANGED
@@ -53,8 +53,6 @@ class subs:
         chunks = []
         current_chunk = ""  # limited to {size}
         current_duaration = 0  # TODO: add better variable name
-        c_d_target = 2
-        c_d_count = 0
 
         for subline in subs:
             current_duaration = subline["start"]
utils/utils.py ADDED
@@ -0,0 +1,36 @@
+import subprocess, os
+
+def Popen(cmd: list) -> str:
+    """Run a command and return the output as a string
+
+    - example: print(Popen(["ls", "-l"]))
+
+    Args:
+        cmd (list): The command to run
+
+    Returns:
+        str: The output of the command
+    """
+    return subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE).stdout.read().strip().decode('utf-8')
+
+def getfilesR(path: str, sorted=False) -> list:
+    """Get all files in a directory recursively
+
+    Args:
+        path (str): The path to the directory. "." for current directory
+        sorted (bool, optional): Sort the files. Defaults to False.
+
+    Returns:
+        list: The list of files
+    """
+
+    files = []
+    # walk the tree recursively, collecting files at every depth
+    for r, d, f in os.walk(path):
+        for file in f:
+            files.append(os.path.join(r, file))
+
+    if sorted:
+        files.sort()
+
+    return files
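
Both helpers in action (paths assumed):

from utils.utils import Popen, getfilesR

print(Popen(["ffmpeg", "-version"]))  # captured stdout as a string
pngs = [f for f in getfilesR("out", sorted=True) if f.endswith(".png")]
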
utils/video.py CHANGED
@@ -1,4 +1,6 @@
-import subprocess, os
+import subprocess, os, requests, json
+from utils.log import info
+from utils.subtitles import subs
 
 def Popen(cmd: list) -> str:
     """Run a command and return the output as a string
@@ -12,9 +14,10 @@ def Popen(cmd: list) -> str:
     return subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE).stdout.read().strip().decode('utf-8')
 
 class video:
-    def __init__(self, url, path):
+    def __init__(self, id, path):
         self.path = path
-        self.url = url
+        self.url = "https://youtu.be/" + id
+        self.video_id = id
 
         # check if directory exists
         if not os.path.exists(self.path.split("/")[-1]):
@@ -22,9 +25,9 @@ class video:
 
     def download(self):
         if os.path.exists(f"{self.path}.webm"):
-            print(f"{self.path}.webm already exists, skipping download")
+            info(f"{self.path}.webm already exists, skipping download")
             return
-        print(f"Downloading {self.url}")
+        info(f"Downloading {self.url}")
         # (
         #     Popen(
         #         ["yt-dlp", self.url, "-o", self.path ]
@@ -32,14 +35,13 @@ class video:
         # )
         os.system(f"yt-dlp {self.url} -o {self.path}")
 
-    def getframe(self, timestamp):
-        filename = f"{self.path}_{timestamp}.png"
-
+    def getframe(self, timestamp, out=os.curdir):
+        filename = out
         if os.path.exists(filename):
-            print(f"{filename} already exists, skipping download")
+            info(f"{filename} already exists, skipping frame")
             return
 
-        print(f"Getting frame at {timestamp}")
+        info(f"Getting frame at {timestamp}")
         (
             Popen(
                 [
@@ -49,7 +51,52 @@ class video:
                     "-ss", timestamp,
                     "-i", f"{self.path}.webm",
                     "-vframes", "1",
-                    f"{filename}"
+                    filename
+                ]
+            )
+        )
+
+    def getAudio(self, out="out.mp3"):
+        info("Getting audio...")
+        (
+            Popen(
+                [
+                    "ffmpeg",
+                    "-hide_banner",
+                    "-loglevel", "panic",
+                    "-i", f"{self.path}.webm",
+                    "-vn",
+                    "-ar", "44100",
+                    "-ac", "2",
+                    "-ab", "192K",
+                    "-f", "mp3",
+                    out
                 ]
             )
         )
+
+    def getChapters(self, endpoint: str) -> list:
+        """return the chapters of the video
+
+        Args:
+            endpoint (str): endpoint to query for chapters;
+                yt.lemnoslife.com recommended
+        Returns:
+            list: chapters
+        """
+        res = requests.get(f"{endpoint}")
+        chapters = res.json()['items'][0]['chapters']['chapters']
+        return chapters
+
+    def getSubtitles(self):
+        """return the raw subtitles
+
+        Returns:
+            list: subtitles directly from youtube
+        """
+        return json.loads(
+            json.dumps(
+                subs(self.video_id)
+                .getSubsRaw()
+            )
+        )