Spaces:
Running
Running
Upload 21 files
Browse files- app.py +136 -72
- models/__pycache__/lamini.cpython-310.pyc +0 -0
- models/distilbart_cnn_12_6.py +3 -1
- models/lamini.py +80 -0
- models/t5_small_medium_title_generation.py +1 -1
- utils/__pycache__/chunk.cpython-310.pyc +0 -0
- utils/__pycache__/log.cpython-310.pyc +0 -0
- utils/__pycache__/markdown.cpython-310.pyc +0 -0
- utils/__pycache__/marp_wrapper.cpython-310.pyc +0 -0
- utils/__pycache__/ppt.cpython-310.pyc +0 -0
- utils/__pycache__/subtitles.cpython-310.pyc +0 -0
- utils/__pycache__/video.cpython-310.pyc +0 -0
- utils/chunk.py +97 -0
- utils/log.py +37 -0
- utils/marp_wrapper.py +8 -1
- utils/subtitles.py +0 -2
- utils/utils.py +36 -0
- utils/video.py +58 -11
app.py
CHANGED
@@ -1,97 +1,161 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
# subs = getSubs("G8gEos8F9R0")
|
4 |
-
# print(subs)
|
5 |
-
|
6 |
-
# import json
|
7 |
-
|
8 |
-
# with open("subs.json", "w") as f:
|
9 |
-
# json.dump(subs, f)
|
10 |
-
|
11 |
import datetime
|
|
|
12 |
import gradio as gr
|
|
|
|
|
13 |
|
14 |
-
|
15 |
-
pass
|
16 |
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
CHUNK_SIZE = chunk_size
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
from rich.progress import track
|
21 |
-
|
22 |
-
from models.distilbart_cnn_12_6 import summarize
|
23 |
-
from models.t5_small_medium_title_generation import t5model as generate_title
|
24 |
-
from utils.marp_wrapper import marp
|
25 |
import utils.markdown as md
|
26 |
-
|
27 |
-
from utils.
|
28 |
from utils.ppt import generate_ppt
|
|
|
29 |
from utils.video import video
|
30 |
-
import
|
31 |
-
|
32 |
-
# check for marp command
|
33 |
-
if os.system("command -v marp >> /dev/null") != 0:
|
34 |
-
print("Marp not found. Please install marp-cli.")
|
35 |
-
os.system("bash setup.sh")
|
36 |
-
else:
|
37 |
-
print("Marp found.")
|
38 |
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
#
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
47 |
)
|
48 |
|
49 |
-
#
|
50 |
-
|
|
|
|
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
vid.download()
|
56 |
|
57 |
-
|
58 |
-
print("Getting subtitles...")
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
-
|
|
|
|
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
76 |
|
77 |
-
ppt.add_page( md.h2(title), summary )
|
78 |
|
79 |
-
if os.path.exists(img_path):
|
80 |
-
ppt.add_body(md.image( img_path,
|
81 |
-
align="left", setAsBackground=True, size="contain"))
|
82 |
-
|
83 |
-
ppt.marp_end()
|
84 |
-
chunk_num += 1
|
85 |
-
continue
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
90 |
|
91 |
-
# return full path to the ppt file
|
92 |
return os.path.abspath(OUT_PPT_NAME)
|
93 |
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
-
|
|
|
|
|
|
1 |
+
import argparse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import datetime
|
3 |
+
import os
|
4 |
import gradio as gr
|
5 |
+
from signal import SIGINT, signal
|
6 |
+
from utils.log import debug, info, logger, breakPoint as bc
|
7 |
|
8 |
+
import requests
|
|
|
9 |
|
10 |
+
from constants import *
|
11 |
+
|
12 |
+
CHUNK_SIZE = 512
|
13 |
+
VIDEO_ID = ""
|
14 |
+
OUT_PPT_NAME= PPTX_DEST
|
15 |
+
NO_IMAGES = False
|
16 |
+
QUESTIONS = 5
|
17 |
+
|
18 |
+
def gradio_run(
        video_id, chunk_size: int,
        no_images: bool, no_chapters: bool, out_type="pdf"):
    """Turn a YouTube video into a slide deck and return the deck's path.

    The video is downloaded, its subtitles are chunked (by size only, or by
    chapter), every chunk/chapter is summarized with the LaMini model and the
    result is written as Marp markdown, which ``generate_ppt`` then renders.

    Args:
        video_id: YouTube video id (``video`` appends it to
            ``https://youtu.be/``).
        chunk_size: Value passed as ``size`` to the chunkers.
        no_images: When true, no video frames are attached to content slides.
        no_chapters: When true, subtitles are chunked by size only and a
            title is generated per chunk; otherwise chapter titles are used.
        out_type: File extension of the generated deck.

    Returns:
        Absolute path of the generated file.

    Raises:
        gr.Error: If the video has no subtitles or they yield no chunks.
            (Previously the process was terminated with ``exit()``, which
            is not appropriate inside a Gradio callback.)
    """
    out_name = f"{OUTDIR}/gradio-out{video_id}.{out_type}"

    # Heavy dependencies are imported lazily so the UI starts quickly.
    info("Loading modules..")
    from langchain.chains.summarize import load_summarize_chain
    from langchain.docstore.document import Document
    from rich.progress import track

    import utils.markdown as md
    from models.lamini import lamini as model
    from utils.marp_wrapper import marp
    from utils.ppt import generate_ppt
    from utils.subtitles import subs
    from utils.video import video
    from utils.chunk import ChunkByChapters

    # initialize marp
    out = marp(MD_DEST)
    out.add_header(config=MARP_GAIA)

    # initialize video
    vid = video(video_id, f"{OUTDIR}/vid-{video_id}")
    vid.download()

    # initialize model
    llm_model = model
    llm = llm_model.load_model(
        max_length=400,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15
    )

    info(f"Getting subtitles {video_id}..")
    raw_subs = vid.getSubtitles()

    # An empty list is as unusable as None: there is nothing to summarize.
    if not raw_subs:
        logger.critical("No subtitles found, exiting..")
        raise gr.Error("No subtitles found for this video.")

    info(f"got {len(raw_subs)} length subtitles")

    if no_chapters:
        chunks = subs(video_id).getSubsList(size=chunk_size)
        if not chunks:
            raise gr.Error("The subtitles could not be split into chunks.")

        model_tmplts = llm_model.templates()
        summarizer = model_tmplts.summarize
        title_gen = model_tmplts.generate_title

        # title slide: frame taken at the timestamp of the first chunk
        first_pic = str(datetime.timedelta(seconds=chunks[0][1]))
        img_name = f"vid-{video_id}_{first_pic}.png"
        img_path = f"{PNG_DEST}/{img_name}"
        vid.getframe(first_pic, img_path)
        out.add_page(md.h1(video_id), md.image(url=img_name))
        out.marp_end()

        for chunk in track(chunks, description="(processing chunks) Summarizing.."):
            summary = summarizer(chunk[0])[0]["generated_text"].replace("-", "\n-")
            title = title_gen(chunk[0])[0]["generated_text"]

            # long titles get a smaller heading
            heading = md.h2 if len(title) < 40 else md.h3
            out.add_page(heading(title), summary)

            # long slides get no image, to avoid clipping the text
            if not no_images and len(summary + title) < 270:
                timestamp = str(datetime.timedelta(seconds=chunk[1]))
                frame_name = f"vid-{video_id}_{timestamp}.png"
                frame_path = f"{PNG_DEST}/{frame_name}"
                vid.getframe(timestamp, frame_path)
                out.add_body(md.image(frame_name, align="left", setAsBackground=True))

            out.marp_end()
    else:
        raw_chapters = vid.getChapters(f"{YT_CHAPTER_ENDPOINT}{video_id}")
        chunk_dict = ChunkByChapters(raw_chapters, raw_subs, chunk_size)
        chain = load_summarize_chain(llm, chain_type="stuff")
        # TODO: ( use refine chain type to summarize all chapters )
        for title, subchunks in track(chunk_dict.items(), description="(processing chunks) Summarizing.."):
            debug(subchunks)

            # A chapter without subtitle text has nothing to summarize.
            if not subchunks:
                continue

            # Each entry of subchunks is [[text], [start]]: summarize the
            # text of every chunk of the chapter, and only the text.
            docs = [Document(page_content=entry[0][0]) for entry in subchunks]
            summary = chain.run(docs)

            if not no_images:
                ts = str(datetime.timedelta(seconds=subchunks[0][1][0]))
                img_path = f"{PNG_DEST}/vid-{video_id}_{ts}.png"
                vid.getframe(ts, img_path)
                # if summary is long ignore images for better page and no clipping
                if os.path.exists(img_path) and len(summary + title) < 270:
                    out.add_body(md.image(
                        img_path.replace(f"{OUTEXTRA}/", ""),
                        align="left",
                        setAsBackground=True
                    ))
            out.add_page(md.h2(title), summary)
            out.marp_end()

    info(f"Generating {out_name}..")
    out.close_file()
    generate_ppt(MD_DEST, out_name)
    print(f"Done! {out_name}")

    return os.path.abspath(out_name)
|
144 |
|
145 |
+
def gradio_Interface():
    """Build the Gradio form around ``gradio_run`` and launch it.

    The inputs are, in order: the video id, the chunk size, the
    "No Images" and "No Chapters" switches and the output file format.
    """
    app = gr.Interface(
        fn=gradio_run,
        inputs=[
            "text",
            # The third positional argument of gr.Slider is the default
            # value, not the step; a default chunk size of 1 is unusable,
            # so start from the module default and step by whole numbers.
            gr.Slider(1, 2000, value=CHUNK_SIZE, step=1, label="Chunk Size", info="More chunk size = longer text & shorter numbber of slides"),
            gr.Checkbox(label="No Images", info="Don't keep images in output ( gives more spaces for larger text)"),
            gr.Checkbox(label="No Chapters", info="Don't use chapter based chunking"),
            # Preselect the same format gradio_run defaults to, so the
            # callback never receives an empty selection.
            gr.Dropdown(["pptx", "pdf", "html"], value="pdf", label="file format", info="which file format to generte.")
        ],
        outputs="file"
    )
    app.launch()
|
158 |
|
159 |
+
if __name__ == "__main__":
|
160 |
+
logger.info("Starting gradio interface..")
|
161 |
+
gradio_Interface()
|
models/__pycache__/lamini.cpython-310.pyc
ADDED
Binary file (2.14 kB). View file
|
|
models/distilbart_cnn_12_6.py
CHANGED
@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
4 |
# loading the model outside of the function makes it faster
|
5 |
SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"
|
6 |
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
|
7 |
-
model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
|
8 |
|
9 |
def summarize(text, max_len=20):
|
10 |
"""
|
@@ -24,6 +24,8 @@ def summarize(text, max_len=20):
|
|
24 |
truncation=True,
|
25 |
).input_ids
|
26 |
|
|
|
|
|
27 |
|
28 |
outputs = model.generate(inputs,
|
29 |
max_new_tokens=100,
|
|
|
4 |
# loading the model outside of the function makes it faster
|
5 |
SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"
|
6 |
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
|
7 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL, device_map="cuda:0")
|
8 |
|
9 |
def summarize(text, max_len=20):
|
10 |
"""
|
|
|
24 |
truncation=True,
|
25 |
).input_ids
|
26 |
|
27 |
+
# Move the inputs tensor to the same device as the model tensor
|
28 |
+
inputs = inputs.cuda()
|
29 |
|
30 |
outputs = model.generate(inputs,
|
31 |
max_new_tokens=100,
|
models/lamini.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Load model directly
|
2 |
+
from langchain.llms import HuggingFacePipeline
|
3 |
+
from transformers import (
|
4 |
+
AutoTokenizer,
|
5 |
+
AutoModelForSeq2SeqLM,
|
6 |
+
pipeline,
|
7 |
+
GenerationConfig
|
8 |
+
)
|
9 |
+
|
10 |
+
model_id = "MBZUAI/LaMini-Flan-T5-248M"
|
11 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
12 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="cuda:0")
|
13 |
+
gen_config = GenerationConfig.from_pretrained(model_id)
|
14 |
+
|
15 |
+
class lamini:
    """Helpers built on the module-level LaMini-Flan-T5 model and tokenizer.

    ``load_model`` wraps the model for LangChain, ``templates`` offers
    ready-made prompts that run the model directly.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def load_model(
        task="text2text-generation",
        **kwargs
    ):
        """Return a LangChain LLM backed by a transformers pipeline.

        - model: MBZUAI/LaMini-Flan-T5-248M

        Declared as a static method so it works both as
        ``lamini.load_model(...)`` and on an instance.

        Args:
            task: transformers pipeline task to create.
            **kwargs: Optional generation settings; recognised keys are
                ``max_length`` (512), ``temperature`` (0), ``top_p`` (0.95)
                and ``repetition_penalty`` (1.15).

        Returns:
            HuggingFacePipeline: the pipeline wrapped for LangChain.
        """
        pipe = pipeline(
            task,
            model=model,
            tokenizer=tokenizer,
            generation_config=gen_config,
            max_length=kwargs.get("max_length", 512),
            top_p=kwargs.get("top_p", 0.95),
            temperature=kwargs.get("temperature", 0),
            repetition_penalty=kwargs.get("repetition_penalty", 1.15),
        )

        return HuggingFacePipeline(pipeline=pipe)

    class templates:
        """Prompt templates that call the model without LangChain."""

        def __init__(self) -> None:
            # pipelines keyed by max_length, built on first use and reused
            # instead of being recreated for every chunk
            self._pipes = {}

        def _pipe(self, max_length):
            """Return the cached text2text pipeline for *max_length*."""
            if max_length not in self._pipes:
                self._pipes[max_length] = pipeline(
                    "text2text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_length=max_length,
                    generation_config=gen_config,
                    temperature=0,
                    top_p=0.95,
                    repetition_penalty=1.15
                )
            return self._pipes[max_length]

        def summarize(self, text):
            """Summarize *text*; returns the raw pipeline output."""
            instructions = "summarize for better understanding: "
            return self._pipe(400)(instructions + text)

        def generate_title(self, text):
            """Generate a short title for *text*; returns the raw pipeline output."""
            instructions = "generate a perfect title for the following text in 6 words: "
            return self._pipe(60)(instructions + text)
|
80 |
+
|
models/t5_small_medium_title_generation.py
CHANGED
@@ -4,7 +4,7 @@ import torch
|
|
4 |
|
5 |
def t5model(prompt: str) -> str:
|
6 |
tokenizer = AutoTokenizer.from_pretrained("fabiochiu/t5-small-medium-title-generation")
|
7 |
-
model = AutoModelForSeq2SeqLM.from_pretrained("fabiochiu/t5-small-medium-title-generation", torch_dtype=torch.
|
8 |
inputs = tokenizer(
|
9 |
["summarize:" + prompt],
|
10 |
return_tensors="pt",
|
|
|
4 |
|
5 |
def t5model(prompt: str) -> str:
|
6 |
tokenizer = AutoTokenizer.from_pretrained("fabiochiu/t5-small-medium-title-generation")
|
7 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("fabiochiu/t5-small-medium-title-generation", device_map="cuda:0", torch_dtype=torch.float16)
|
8 |
inputs = tokenizer(
|
9 |
["summarize:" + prompt],
|
10 |
return_tensors="pt",
|
utils/__pycache__/chunk.cpython-310.pyc
ADDED
Binary file (3.18 kB). View file
|
|
utils/__pycache__/log.cpython-310.pyc
ADDED
Binary file (1.14 kB). View file
|
|
utils/__pycache__/markdown.cpython-310.pyc
ADDED
Binary file (1.87 kB). View file
|
|
utils/__pycache__/marp_wrapper.cpython-310.pyc
ADDED
Binary file (2.34 kB). View file
|
|
utils/__pycache__/ppt.cpython-310.pyc
ADDED
Binary file (665 Bytes). View file
|
|
utils/__pycache__/subtitles.cpython-310.pyc
ADDED
Binary file (2.38 kB). View file
|
|
utils/__pycache__/video.cpython-310.pyc
ADDED
Binary file (2.95 kB). View file
|
|
utils/chunk.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
# divide the subs into chunks for more accurate summarization
|
2 |
# TODO: divide the subs into chunks based on the topics
|
3 |
# summarize each chunk and add it to the markdown file
|
|
|
4 |
|
5 |
class legacy_chunker:
|
6 |
# legacy manual chunker
|
@@ -50,3 +51,99 @@ class LangChainChunker:
|
|
50 |
for _ in self.text:
|
51 |
count += 1
|
52 |
return count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# divide the subs into chunks for more accurate summarization
|
2 |
# TODO: divide the subs into chunks based on the topics
|
3 |
# summarize each chunk and add it to the markdown file
|
4 |
+
from rich.progress import track
|
5 |
|
6 |
class legacy_chunker:
|
7 |
# legacy manual chunker
|
|
|
51 |
for _ in self.text:
|
52 |
count += 1
|
53 |
return count
|
54 |
+
|
55 |
+
def ChunkByChapters(chapters: list, subs: list, size=1000):
    """Chunk the youtube video subtitles based on the chapters

    Every subtitle line is assigned to the chapter during which it starts
    and the lines of a chapter are joined, separated by spaces, into chunks
    shorter than ``size`` characters. A single line that is itself longer
    than ``size`` becomes a chunk of its own.

    Args:
        chapters (list): Chapters from yt api; each item provides 'title'
            and 'time' (the second at which the chapter begins). Expected
            in chronological order.
        subs (list): subtitles from yt api; each item provides 'start' and
            'text'. Expected in chronological order.
        size (int, optional): Upper bound on chunk length. Defaults to 1000.

    Raises:
        Exception: No chapters found

    Returns:
        dict : structure chunk_dict = {
            "chapter1": [
                [[chunk1_text], [chunk1_timestamp]],
                [[chunk2_text], [chunk2_timestamp]],
                ...
            ],
            ...
        }
        The timestamp is the start of the last subtitle line seen when the
        chunk was closed. Chapters sharing a title overwrite each other.
    """
    if not chapters:
        raise Exception("No chapters found")

    chunk_dict = {}
    starts = [chapter['time'] for chapter in chapters]

    for c in track(
        range(len(chapters)),
        description="Chunking by chapters: "
    ):
        title = chapters[c]['title']

        # A chapter runs from its own timestamp up to (not including) the
        # next chapter's timestamp. The first chapter also takes anything
        # said before its timestamp, the last one runs to the end.
        start = 0 if c == 0 else starts[c]
        end = starts[c + 1] if c + 1 < len(starts) else float("inf")

        chunks = []
        current_chunk = ""
        last_start = start

        for sublinedata in subs:
            cstart = sublinedata['start']
            subline: str = sublinedata['text']

            if cstart < start:
                continue
            if cstart >= end:
                # subtitles are chronological: nothing later belongs here
                break

            candidate = f"{current_chunk} {subline}" if current_chunk else subline
            if current_chunk and len(candidate) >= size:
                # close the full chunk and start the next one with the
                # line that did not fit, so no text is lost
                chunks.append(
                    [
                        [current_chunk.strip()],
                        [cstart],
                    ]
                )
                current_chunk = subline
            else:
                current_chunk = candidate
            last_start = cstart

        # keep whatever is left once the chapter ends
        if current_chunk.strip():
            chunks.append(
                [
                    [current_chunk.strip()],
                    [last_start],
                ]
            )

        chunk_dict[title] = chunks

    return chunk_dict
|
utils/log.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from rich.logging import RichHandler
|
3 |
+
|
4 |
+
FORMAT = "%(message)s"
|
5 |
+
logging.basicConfig(
|
6 |
+
level="NOTSET",
|
7 |
+
format=FORMAT,
|
8 |
+
datefmt="[%X]",
|
9 |
+
handlers=[RichHandler(rich_tracebacks=True)],
|
10 |
+
)
|
11 |
+
|
12 |
+
logger = logging.getLogger("project-v-p")
|
13 |
+
|
14 |
+
def warn(msg):
    """Emit *msg* through the project logger at WARNING level."""
    logger.warning(msg)
|
16 |
+
|
17 |
+
def info(msg):
    """Emit *msg* through the project logger at INFO level."""
    logger.info(msg)
|
19 |
+
|
20 |
+
def debug(msg):
    """Emit *msg* through the project logger at DEBUG level."""
    logger.debug(msg)
|
22 |
+
|
23 |
+
def breakPoint():
    """A quick function to pause the program for debug or analysis

    Prints a prompt and reads one line from stdin. Answering "y" (any
    case) or just pressing Enter resumes the program; any other answer
    logs a message and ends the process.

    Raises:
        SystemExit: when the answer is anything other than "y" or empty.
    """
    print("Breakpoint: Press Y to continue, N to exit")
    answer = input().strip().lower()
    if answer in ("", "y"):
        return

    logger.info("Exiting...")
    # raise directly instead of calling exit(), which is only injected by
    # the site module and is meant for interactive sessions
    raise SystemExit
|
utils/marp_wrapper.py
CHANGED
@@ -16,7 +16,8 @@ class marp:
|
|
16 |
paginate: bool = True,
|
17 |
background: str = "",
|
18 |
backgroundImage: str = None,
|
19 |
-
extra_styles: str = None
|
|
|
20 |
):
|
21 |
## write the header
|
22 |
# ---
|
@@ -26,6 +27,12 @@ class marp:
|
|
26 |
# backgroundColor: #fff
|
27 |
# backgroundImage: url('https://marp.app/assets/hero-background.svg')
|
28 |
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
self.marp_write("---\n")
|
30 |
self.marp_write("marp: true\n")
|
31 |
self.marp_write(f"theme: {theme}\n")
|
|
|
16 |
paginate: bool = True,
|
17 |
background: str = "",
|
18 |
backgroundImage: str = None,
|
19 |
+
extra_styles: str = None,
|
20 |
+
config: dict = None
|
21 |
):
|
22 |
## write the header
|
23 |
# ---
|
|
|
27 |
# backgroundColor: #fff
|
28 |
# backgroundImage: url('https://marp.app/assets/hero-background.svg')
|
29 |
# ---
|
30 |
+
|
31 |
+
if config is not None:
|
32 |
+
theme = config["theme"]
|
33 |
+
background = config["background"]
|
34 |
+
_class = config["class"]
|
35 |
+
|
36 |
self.marp_write("---\n")
|
37 |
self.marp_write("marp: true\n")
|
38 |
self.marp_write(f"theme: {theme}\n")
|
utils/subtitles.py
CHANGED
@@ -53,8 +53,6 @@ class subs:
|
|
53 |
chunks = []
|
54 |
current_chunk = "" # limited to {size}
|
55 |
current_duaration = 0 # TODO: add better variable name
|
56 |
-
c_d_target = 2
|
57 |
-
c_d_count = 0
|
58 |
|
59 |
for subline in subs:
|
60 |
current_duaration = subline["start"]
|
|
|
53 |
chunks = []
|
54 |
current_chunk = "" # limited to {size}
|
55 |
current_duaration = 0 # TODO: add better variable name
|
|
|
|
|
56 |
|
57 |
for subline in subs:
|
58 |
current_duaration = subline["start"]
|
utils/utils.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess, os
|
2 |
+
|
3 |
+
def Popen(cmd: list) -> str:
    """Run a command and return the output as a string

    - example: print(Popen(["ls", "-l"]))

    The command is executed without a shell and waited for, so no child
    process or pipe is left behind. The exit status is not checked.

    Args:
        cmd (list): The command to run

    Returns:
        str: The command's stdout, stripped of surrounding whitespace and
        decoded as UTF-8
    """
    completed = subprocess.run(cmd, shell=False, stdout=subprocess.PIPE)
    return completed.stdout.strip().decode('utf-8')
|
15 |
+
|
16 |
+
def getfilesR(path: str, sorted=False) -> list:
    """Collect every file below a directory, descending into subdirectories

    Args:
        path (str): Directory to walk. "." for current directory
        sorted (bool, optional): Return the paths in sorted order.
            Defaults to False.

    Returns:
        list: Paths of all files found, each joined onto its directory
    """
    found = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]

    # the parameter shadows the builtin, so sort in place
    if sorted:
        found.sort()

    return found
|
utils/video.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
-
import subprocess, os
|
|
|
|
|
2 |
|
3 |
def Popen(cmd: list) -> str:
|
4 |
"""Run a command and return the output as a string
|
@@ -12,9 +14,10 @@ def Popen(cmd: list) -> str:
|
|
12 |
return subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE).stdout.read().strip().decode('utf-8')
|
13 |
|
14 |
class video:
|
15 |
-
def __init__(self,
|
16 |
self.path = path
|
17 |
-
self.url =
|
|
|
18 |
|
19 |
# check if directory exists
|
20 |
if not os.path.exists(self.path.split("/")[-1]):
|
@@ -22,9 +25,9 @@ class video:
|
|
22 |
|
23 |
def download(self):
|
24 |
if os.path.exists(f"{self.path}.webm"):
|
25 |
-
|
26 |
return
|
27 |
-
|
28 |
# (
|
29 |
# Popen(
|
30 |
# ["yt-dlp", self.url, "-o", self.path ]
|
@@ -32,14 +35,13 @@ class video:
|
|
32 |
# )
|
33 |
os.system(f"yt-dlp {self.url} -o {self.path}")
|
34 |
|
35 |
-
def getframe(self, timestamp):
|
36 |
-
filename =
|
37 |
-
|
38 |
if os.path.exists(filename):
|
39 |
-
|
40 |
return
|
41 |
|
42 |
-
|
43 |
(
|
44 |
Popen(
|
45 |
[
|
@@ -49,7 +51,52 @@ class video:
|
|
49 |
"-ss", timestamp,
|
50 |
"-i", f"{self.path}.webm",
|
51 |
"-vframes", "1",
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
]
|
54 |
)
|
55 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess, os, requests, json
|
2 |
+
from utils.log import info
|
3 |
+
from utils.subtitles import subs
|
4 |
|
5 |
def Popen(cmd: list) -> str:
|
6 |
"""Run a command and return the output as a string
|
|
|
14 |
return subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE).stdout.read().strip().decode('utf-8')
|
15 |
|
16 |
class video:
|
17 |
+
def __init__(self,id, path):
|
18 |
self.path = path
|
19 |
+
self.url = "https://youtu.be/" + id
|
20 |
+
self.video_id = id
|
21 |
|
22 |
# check if directory exists
|
23 |
if not os.path.exists(self.path.split("/")[-1]):
|
|
|
25 |
|
26 |
def download(self):
|
27 |
if os.path.exists(f"{self.path}.webm"):
|
28 |
+
info(f"{self.path}.webm already exists, skipping download")
|
29 |
return
|
30 |
+
info(f"Downloading {self.url}")
|
31 |
# (
|
32 |
# Popen(
|
33 |
# ["yt-dlp", self.url, "-o", self.path ]
|
|
|
35 |
# )
|
36 |
os.system(f"yt-dlp {self.url} -o {self.path}")
|
37 |
|
38 |
+
def getframe(self, timestamp, out=os.curdir):
|
39 |
+
filename = out
|
|
|
40 |
if os.path.exists(filename):
|
41 |
+
info(f"{filename} already exists, skipping frame")
|
42 |
return
|
43 |
|
44 |
+
info(f"Getting frame at {timestamp}")
|
45 |
(
|
46 |
Popen(
|
47 |
[
|
|
|
51 |
"-ss", timestamp,
|
52 |
"-i", f"{self.path}.webm",
|
53 |
"-vframes", "1",
|
54 |
+
filename
|
55 |
+
]
|
56 |
+
)
|
57 |
+
)
|
58 |
+
|
59 |
+
def getAudio(self, out="out.mp3"):
    """Extract the audio of the downloaded video into an mp3 file.

    Runs ffmpeg on ``{self.path}.webm`` and writes the result to *out*.

    Args:
        out: Path of the mp3 file to write.
    """
    info("Getting audio...")
    ffmpeg_cmd = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "panic",
        "-i", f"{self.path}.webm",
        "-vn",              # audio only
        "-ar", "44100",     # sample rate
        "-ac", "2",         # channels
        "-ab", "192K",      # bitrate
        "-f", "mp3",
        out,
    ]
    Popen(ffmpeg_cmd)
|
77 |
+
|
78 |
+
def getChapters(self, endpoint: str) -> list:
    """return the chapters of the video

    Args:
        endpoint (str): endpoint to communicate to get chapters
            yt.lemnoslife.com recommended

    Returns:
        list: chapters, read from ``items[0].chapters.chapters`` of the
        JSON response

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.Timeout: if the endpoint does not answer in time.
    """
    # Without a timeout a stalled endpoint would block the caller forever.
    res = requests.get(endpoint, timeout=30)
    # Fail on HTTP errors here rather than on a confusing parse error below.
    res.raise_for_status()
    chapters = res.json()['items'][0]['chapters']['chapters']
    return chapters
|
90 |
+
|
91 |
+
def getSubtitles(self):
    """return the raw subtitles

    Returns:
        The result of ``subs(self.video_id).getSubsRaw()`` after a JSON
        round trip, i.e. as plain JSON-compatible data
        (presumably a list of subtitle entries; verify against ``subs``).
    """
    raw_subtitles = subs(self.video_id).getSubsRaw()
    serialized = json.dumps(raw_subtitles)
    return json.loads(serialized)
|