app file
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +44 -48
- frames.py +102 -0
- lexrank.py +24 -0
- myrpunct/__init__.py +2 -0
- myrpunct/__pycache__/__init__.cpython-310.pyc +0 -0
- myrpunct/__pycache__/__init__.cpython-39.pyc +0 -0
- myrpunct/__pycache__/punctuate.cpython-310.pyc +0 -0
- myrpunct/__pycache__/punctuate.cpython-39.pyc +0 -0
- myrpunct/punctuate.py +174 -0
- myrpunct/utils.py +34 -0
- pytube/__init__.py +19 -0
- pytube/__main__.py +467 -0
- pytube/__pycache__/__init__.cpython-310.pyc +0 -0
- pytube/__pycache__/__init__.cpython-39.pyc +0 -0
- pytube/__pycache__/__main__.cpython-310.pyc +0 -0
- pytube/__pycache__/__main__.cpython-39.pyc +0 -0
- pytube/__pycache__/captions.cpython-310.pyc +0 -0
- pytube/__pycache__/captions.cpython-39.pyc +0 -0
- pytube/__pycache__/cipher.cpython-310.pyc +0 -0
- pytube/__pycache__/cipher.cpython-39.pyc +0 -0
- pytube/__pycache__/exceptions.cpython-310.pyc +0 -0
- pytube/__pycache__/exceptions.cpython-39.pyc +0 -0
- pytube/__pycache__/extract.cpython-310.pyc +0 -0
- pytube/__pycache__/extract.cpython-39.pyc +0 -0
- pytube/__pycache__/helpers.cpython-310.pyc +0 -0
- pytube/__pycache__/helpers.cpython-39.pyc +0 -0
- pytube/__pycache__/innertube.cpython-310.pyc +0 -0
- pytube/__pycache__/innertube.cpython-39.pyc +0 -0
- pytube/__pycache__/itags.cpython-310.pyc +0 -0
- pytube/__pycache__/itags.cpython-39.pyc +0 -0
- pytube/__pycache__/metadata.cpython-310.pyc +0 -0
- pytube/__pycache__/metadata.cpython-39.pyc +0 -0
- pytube/__pycache__/monostate.cpython-310.pyc +0 -0
- pytube/__pycache__/monostate.cpython-39.pyc +0 -0
- pytube/__pycache__/parser.cpython-310.pyc +0 -0
- pytube/__pycache__/parser.cpython-39.pyc +0 -0
- pytube/__pycache__/query.cpython-310.pyc +0 -0
- pytube/__pycache__/query.cpython-39.pyc +0 -0
- pytube/__pycache__/request.cpython-310.pyc +0 -0
- pytube/__pycache__/request.cpython-39.pyc +0 -0
- pytube/__pycache__/streams.cpython-310.pyc +0 -0
- pytube/__pycache__/streams.cpython-39.pyc +0 -0
- pytube/__pycache__/version.cpython-310.pyc +0 -0
- pytube/__pycache__/version.cpython-39.pyc +0 -0
- pytube/captions.py +154 -0
- pytube/cipher.py +697 -0
- pytube/cli.py +560 -0
- pytube/contrib/__init__.py +0 -0
- pytube/contrib/__pycache__/__init__.cpython-310.pyc +0 -0
- pytube/contrib/__pycache__/__init__.cpython-39.pyc +0 -0
app.py
CHANGED
@@ -1,55 +1,51 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
from fastapi import FastAPI
|
4 |
from PIL import Image
|
5 |
-
import
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
"</html>"
|
27 |
-
)
|
28 |
-
|
29 |
-
return HTMLResponse(content=body)
|
30 |
-
|
31 |
-
@app.get("/api")
|
32 |
-
async def cal_api():
|
33 |
images = []
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
images.append(
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
byte_content = open_file.read()
|
43 |
-
base64_bytes = base64.b64encode(byte_content)
|
44 |
-
base64_string = base64_bytes.decode('utf-8')
|
45 |
-
images.append(base64_string)
|
46 |
|
47 |
-
#image_path='lion.jpg'
|
48 |
-
#pilim = Image.open(image_path)
|
49 |
-
#pilimrot = pilim.rotate(45)
|
50 |
-
return {"data": images}
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
|
|
|
|
|
|
1 |
+
import gradio as gr
|
|
|
|
|
2 |
from PIL import Image
|
3 |
+
import os
|
4 |
+
import summarizer as su
|
5 |
+
import nltk
|
6 |
+
|
7 |
+
|
8 |
+
def image_mod(rpunkt_switch, link):
    """Gradio callback: summarize the video behind ``link``.

    Args:
        rpunkt_switch: Value of the "restore punctuation" checkbox; forwarded
            unchanged to ``su.getSummary``.
        link: Text entered by the user, e.g.
            ``https://www.youtube.com/watch?v=lCnHfTHkhbE``.

    Returns:
        The ``(html, images)`` pair produced by ``su.getSummary``, or
        ``('Error: No link provided', None)`` when ``link`` is empty, ``None``
        or only whitespace.
    """
    # Normalize first so that None and whitespace-only input are rejected too.
    link = link.strip() if link else ''
    if not link:
        return 'Error: No link provided', None

    # Download the punkt tokenizer only if it is not already present in one of
    # the known home directories (dev machine, hosted space, current user).
    nltk_file = 'nltk_data/tokenizers/punkt.zip'
    home_pc = '/Users/hujo/'
    home_hf = '/home/user/'
    home_dirs = (home_pc, home_hf, os.path.expanduser('~'))
    if any(os.path.exists(os.path.join(home, nltk_file)) for home in home_dirs):
        print('nltk punkt file exists in ', nltk_file)
    else:
        nltk.download('punkt')

    lexrank_switch = True
    html, images = su.getSummary(link, lexrank_switch, rpunkt_switch)
    print(html)

    # Debug listing only: a missing work directory must not fail the request
    # after the summary has already been computed.
    workdir = 'workdir/'
    files = os.listdir(workdir) if os.path.isdir(workdir) else []
    print('local files: ', files)

    print('images', images)

    return html, images
|
|
|
|
|
|
|
|
|
44 |
|
|
|
|
|
|
|
|
|
45 |
|
46 |
+
# Gradio UI: a checkbox and a text box feed image_mod; its two return values
# are rendered as HTML and as an image gallery.
demo = gr.Interface(
    image_mod,
    [gr.Checkbox(label='Restore punctuation'), "text"],
    ["html", gr.Gallery()],
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()
|
frames.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ast import Try
|
2 |
+
import subprocess as sp
|
3 |
+
import os
|
4 |
+
|
5 |
+
# show current venv: echo $VIRTUAL_ENV
|
6 |
+
# import sys
|
7 |
+
# del sys.modules['frames']
|
8 |
+
|
9 |
+
# transcript module
|
10 |
+
# 1. extract timestamps from transcript
|
11 |
+
# 2. extract captions from transcript
|
12 |
+
# this module
|
13 |
+
# 3. extract frames at timestamps
|
14 |
+
# 4. add caption to each frame
|
15 |
+
# 5. convert images to mp4 video
|
16 |
+
|
17 |
+
# converts a list of images to a mp4 video
|
18 |
+
def convertImageToVideo():
    """Assemble ``frame_0001.jpg``, ``frame_0002.jpg``, ... into ``output_video.mp4``.

    ffmpeg is executed inside ``./workdir`` and its stderr output is captured.

    Returns:
        A two-element list ``[return_code, stderr_bytes]``.
    """
    ffmpeg_args = "ffmpeg -y -f image2 -i frame_%04d.jpg output_video.mp4".split()
    completed = sp.run(ffmpeg_args, cwd='./workdir', stderr=sp.PIPE)
    return [completed.returncode, completed.stderr]
|
27 |
+
|
28 |
+
|
29 |
+
# extract a frame as jpg image file
|
30 |
+
# from a video at a given timestamp
|
31 |
+
# num=0; for p in $(cat timestamps); do ((num++)); printf "$num $p\r"; dnum=$(printf "%03d" "$num"); ffmpeg -ss $p -i "$mp4file" -frames:v 1 out_$dnum.jpg >& ffmpeg.out; done
|
32 |
+
def extractImagesFromVideo(timestamps):
    """Extract one JPEG frame per timestamp from ``./workdir/input_video.mp4``.

    Frames are written to ``./workdir`` as ``frame_0001.jpg``,
    ``frame_0002.jpg``, ... in the order of ``timestamps``.

    Args:
        timestamps: Iterable of seek positions; each one is passed to ffmpeg
            via ``-ss`` after conversion with ``str()``.

    Returns:
        A list with one ffmpeg return code per timestamp, or an error string
        when the input video does not exist.
    """
    working_dir = './workdir'
    input_file = 'input_video.mp4'

    # Create the working directory before looking for the input file;
    # checking the file first made this branch unreachable.
    if not os.path.isdir(working_dir):
        print('There is no working directory. Create a new one.')
        os.mkdir(working_dir)

    if not os.path.isfile(os.path.join(working_dir, input_file)):
        return 'Error: File '+input_file+' is missing, create the file first.'

    proc_list = []
    for current_frame, current_timestamp in enumerate(timestamps, start=1):
        frame_id = f"{current_frame:04d}"
        print(frame_id, current_timestamp)
        # Build the argument list directly so no argument is re-split on spaces.
        cmd_call = [
            'ffmpeg', '-y',
            '-ss', str(current_timestamp),
            '-i', input_file,
            '-frames:v', '1',
            'frame_' + frame_id + '.jpg',
        ]
        # stderr was piped but never read, which can block the child once the
        # pipe buffer fills while we wait for it; discard the output instead.
        completed = sp.run(cmd_call, cwd=working_dir, stderr=sp.DEVNULL)
        proc_list.append(completed.returncode)

    return proc_list
|
54 |
+
|
55 |
+
# add caption to each image
|
56 |
+
# the 'convert' program is from the 'imagemagick' package
|
57 |
+
# num=0; while read p; do ((num++)); dnum=$(printf "%03d" "$num"); printf "$dnum $p\r"; convert out_$dnum.jpg -undercolor Black -fill white -gravity South -pointsize 25 -annotate +0+10 "$p" out_$dnum.jpg >& ffmpeg.out; done<srt.txt
|
58 |
+
def addCaptionToImage(caption):
    """Draw one caption line onto each extracted frame.

    Line ``n`` of ``caption`` (1-based) is annotated onto
    ``./workdir/frame_<n, 4 digits>.jpg`` with the ``convert`` program; the
    frame file is overwritten in place.

    Args:
        caption: Newline-separated caption text, one line per frame.

    Returns:
        A list with one ``convert`` return code per caption line.
    """
    working_dir = './workdir'
    annotate_options = (
        '-undercolor Black -fill white -gravity South '
        '-pointsize 25 -annotate +0+10'
    ).split()

    proc_list = []
    for current_frame, current_caption in enumerate(caption.split('\n'), start=1):
        frame_id = f"{current_frame:04d}"
        print(frame_id, current_caption)

        frame_file = 'frame_' + frame_id + '.jpg'
        # The caption is added as a single argument so that its spaces are
        # not split into separate command-line words.
        cmd_call = ['convert', frame_file, *annotate_options, current_caption, frame_file]

        # stderr was piped but never read, which can block the child once the
        # pipe buffer fills while we wait for it; discard the output instead.
        completed = sp.run(cmd_call, cwd=working_dir, stderr=sp.DEVNULL)
        proc_list.append(completed.returncode)

    return proc_list
|
78 |
+
|
79 |
+
|
80 |
+
def removeFilesInWorkdir():
    """Delete every entry directly inside ``./workdir``.

    Removal is best effort: an entry that cannot be removed (for example a
    sub-directory) is skipped and the remaining entries are still processed.

    Returns:
        ``''`` on success, or ``'Error: Not all files could be removed.'`` when
        the directory could not be listed or at least one entry was left.
    """
    error_msg = 'Error: Not all files could be removed.'
    working_dir = './workdir'

    try:
        entries = os.listdir(working_dir)
    except OSError:
        return error_msg

    result = ''
    for f in entries:
        try:
            os.remove(os.path.join(working_dir, f))
        except OSError:
            # Keep going so one stubborn entry does not protect the others.
            result = error_msg

    return result
|
90 |
+
|
91 |
+
def renameOutputVideo(filenme):
    """Move ``./workdir/output_video.mp4`` to ``./shelf/<filenme>.mp4``.

    Args:
        filenme: Target file name without the ``.mp4`` extension.

    Returns:
        ``''`` on success, or ``'Error: Could not rename file.'`` when the
        rename fails (missing source file, missing ``./shelf`` directory, ...).
    """
    result = ''
    working_dir = './workdir'
    shelf_dir = './shelf'
    input_filename = working_dir+'/'+'output_video.mp4'
    output_filename = shelf_dir+'/'+filenme+'.mp4'
    try:
        os.rename(input_filename, output_filename)
    except (OSError, ValueError):
        # OSError: file-system failure; ValueError: invalid path such as an
        # embedded NUL. Anything else (e.g. KeyboardInterrupt) propagates.
        result = 'Error: Could not rename file.'

    return result
|
lexrank.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#import nltk
|
2 |
+
#nltk.download('punkt')
|
3 |
+
|
4 |
+
from sumy.parsers.html import HtmlParser
|
5 |
+
from sumy.parsers.plaintext import PlaintextParser
|
6 |
+
from sumy.nlp.tokenizers import Tokenizer
|
7 |
+
from sumy.summarizers.lex_rank import LexRankSummarizer
|
8 |
+
from sumy.nlp.stemmers import Stemmer
|
9 |
+
from sumy.utils import get_stop_words
|
10 |
+
|
11 |
+
def getSummary(text, nr_sentences):
    """Return a LexRank summary of ``text``.

    Args:
        text: Plain text to summarize; it is tokenized as English.
        nr_sentences: Sentence count handed to the LexRank summarizer.

    Returns:
        A list of the sentence objects yielded by the summarizer.
    """
    language = "english"
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    ranker = LexRankSummarizer(Stemmer(language))
    ranker.stop_words = get_stop_words(language)
    return list(ranker(parser.document, nr_sentences))
|
myrpunct/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .punctuate import RestorePuncts
|
2 |
+
print("init executed ...")
|
myrpunct/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (231 Bytes). View file
|
|
myrpunct/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (227 Bytes). View file
|
|
myrpunct/__pycache__/punctuate.cpython-310.pyc
ADDED
Binary file (5.71 kB). View file
|
|
myrpunct/__pycache__/punctuate.cpython-39.pyc
ADDED
Binary file (5.69 kB). View file
|
|
myrpunct/punctuate.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# 💾⚙️🔮
|
3 |
+
|
4 |
+
__author__ = "Daulet N."
|
5 |
+
__email__ = "daulet.nurmanbetov@gmail.com"
|
6 |
+
|
7 |
+
import logging
|
8 |
+
from langdetect import detect
|
9 |
+
from simpletransformers.ner import NERModel, NERArgs
|
10 |
+
|
11 |
+
|
12 |
+
class RestorePuncts:
    """Restore punctuation and capitalization of English text with a NER model.

    Each word receives a two-character label: the first character is the
    punctuation mark to append (``'O'`` for none) and the second is ``'U'``
    when the word should be capitalized (see ``punctuate_texts``).
    """

    def __init__(self, wrds_per_pred=250, use_cuda=False):
        """Load the punctuation model.

        Args:
            wrds_per_pred: Number of words per prediction chunk.
            use_cuda: Forwarded to ``NERModel``.
        """
        self.wrds_per_pred = wrds_per_pred
        # Extra words appended to each chunk so neighbouring chunks overlap.
        self.overlap_wrds = 30
        self.valid_labels = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U']
        self.model_hf = "wldmr/felflare-bert-restore-punctuation"
        self.model_args = NERArgs()
        self.model_args.silent = True
        self.model_args.max_seq_length = 512
        #self.model_args.use_multiprocessing = False
        self.model = NERModel("bert", self.model_hf, labels=self.valid_labels, use_cuda=use_cuda, args=self.model_args)
        print("class init ...")
        print("use_multiprocessing: ",self.model_args.use_multiprocessing)

    def status(self):
        """Print a marker line; does nothing else."""
        print("function called")

    def punctuate(self, text: str, lang:str=''):
        """
        Performs punctuation restoration on arbitrarily large text.
        Detects if input is not English, if non-English was detected terminates predictions.
        Override by supplying `lang='en'`.

        Language detection only runs when `lang` is empty and the text is
        longer than 10 characters; shorter text without `lang` is processed
        without a language check.

        Args:
            - text (str): Text to punctuate, can be few words to as large as you want.
            - lang (str): Explicit language of input text.

        Returns:
            The punctuated text.

        Raises:
            ValueError: if the detected or supplied language is not 'en'.
        """
        if not lang and len(text) > 10:
            lang = detect(text)
        # Only reject when a language is actually known; previously short
        # input without `lang` was reported as "Non English text detected".
        if lang and lang != 'en':
            raise ValueError(
                "Non English text detected. Restore Punctuation works only for English.\n"
                "If you are certain the input is English, pass argument lang='en' to this function.\n"
                f"Punctuate received: {text}"
            )

        # split up large text into bert digestable chunks
        splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
        # predict slices
        # full_preds_lst contains tuple of labels and logits
        full_preds_lst = [self.predict(i['text']) for i in splits]
        # extract predictions, and discard logits
        preds_lst = [i[0][0] for i in full_preds_lst]
        # join text slices
        combined_preds = self.combine_results(text, preds_lst)
        # create punctuated prediction
        punct_text = self.punctuate_texts(combined_preds)
        return punct_text

    def predict(self, input_slice):
        """
        Passes the unpunctuated text to the model for punctuation.

        Returns the ``(predictions, raw_outputs)`` pair from the model.
        """
        predictions, raw_outputs = self.model.predict([input_slice])
        return predictions, raw_outputs

    @staticmethod
    def split_on_toks(text, length, overlap):
        """
        Splits text into predefined slices of overlapping text with indexes (offsets)
        that tie-back to original text.
        This is done to bypass 512 token limit on transformer models by sequentially
        feeding chunks of < 512 toks.

        Args:
            - text: Text to slice; newlines are treated as spaces.
            - length: Number of words per slice, not counting the overlap.
            - overlap: Number of words of the following slice appended to each slice.

        Example output:
        [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
        """
        wrds = text.replace('\n', ' ').split(" ")
        resp = []
        lst_chunk_idx = 0
        i = 0

        while True:
            # words in the chunk and the overlapping portion
            chunk_start = length * i
            chunk_end = length * (i + 1)
            wrds_len = wrds[chunk_start:chunk_end]
            wrds_ovlp = wrds[chunk_end:chunk_end + overlap]
            wrds_split = wrds_len + wrds_ovlp

            # Break loop if no more words
            if not wrds_split:
                break

            wrds_str = " ".join(wrds_split)
            nxt_chunk_start_idx = len(" ".join(wrds_len))
            lst_char_idx = len(wrds_str)

            resp.append({
                "text": wrds_str,
                "start_idx": lst_chunk_idx,
                "end_idx": lst_char_idx + lst_chunk_idx,
            })
            # +1 accounts for the space that separated this chunk from the next.
            lst_chunk_idx += nxt_chunk_start_idx + 1
            i += 1
        logging.info(f"Sliced transcript into {len(resp)} slices.")
        return resp

    @staticmethod
    def combine_results(full_text: str, text_slices):
        """
        Given a full text and predictions of each slice combines predictions into a single text again.
        Performs validation whether the text was combined correctly.

        Args:
            - full_text: The original, unsliced text.
            - text_slices: One list per slice of single-entry ``{word: label}`` dicts.

        Returns:
            A list of ``(word, label)`` tuples covering every word of ``full_text``.

        Raises:
            AssertionError: if the recombined words differ from the original words.
        """
        split_full_text = [w for w in full_text.replace('\n', ' ').split(" ") if w]
        split_full_text_len = len(split_full_text)
        output_text = []
        index = 0

        # A tiny trailing slice is dropped when there is more than one slice.
        if len(text_slices[-1]) <= 3 and len(text_slices) > 1:
            text_slices = text_slices[:-1]

        last_slice_no = len(text_slices) - 1
        for slice_no, _slice in enumerate(text_slices):
            # Identify the last slice by position; comparing slice contents
            # would misclassify an earlier slice that happens to be equal.
            is_last = slice_no == last_slice_no
            slice_wrds = len(_slice)
            for ix, wrd in enumerate(_slice):
                if index == split_full_text_len:
                    break

                pred_item_tuple = list(wrd.items())[0]
                # Accept the word when it is the next expected word; for every
                # slice but the last, the final two positions are skipped.
                if split_full_text[index] == str(pred_item_tuple[0]) and \
                        (is_last or ix <= slice_wrds - 3):
                    index += 1
                    output_text.append(pred_item_tuple)
        # NOTE: this validation is skipped when Python runs with -O.
        assert [i[0] for i in output_text] == split_full_text
        return output_text

    @staticmethod
    def punctuate_texts(full_pred: list):
        """
        Given a list of Predictions from the model, applies the predictions to text,
        thus punctuating it.

        Args:
            - full_pred: List of ``(word, label)`` tuples.

        Returns:
            The punctuated text; an empty string for an empty prediction list.
        """
        punct_wrds = []
        for word, label in full_pred:
            punct_wrd = word.capitalize() if label[-1] == "U" else word
            if label[0] != "O":
                punct_wrd += label[0]
            punct_wrds.append(punct_wrd)

        punct_resp = " ".join(punct_wrds).strip()
        # Append trailing period if doesnt exist (guarding against empty output).
        if punct_resp and punct_resp[-1].isalnum():
            punct_resp += "."
        return punct_resp
|
165 |
+
|
166 |
+
|
167 |
+
if __name__ == "__main__":
    # Manual smoke test: punctuate the bundled sample text and print the result.
    restorer = RestorePuncts()
    with open('../tests/sample_text.txt', 'r') as sample_file:
        sample_text = sample_file.read()
    print(restorer.punctuate(sample_text))
|
myrpunct/utils.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# 💾⚙️🔮
|
3 |
+
|
4 |
+
__author__ = "Daulet N."
|
5 |
+
__email__ = "daulet.nurmanbetov@gmail.com"
|
6 |
+
|
7 |
+
def prepare_unpunct_text(text):
    """
    Given a text, normalizes it to subsequently restore punctuation.

    The text is lower-cased, line breaks are treated as word separators,
    punctuation around each word is stripped via ``strip_punct`` and words
    that end up empty are dropped.

    Returns the normalized words joined by single spaces.
    """
    # Replace newlines with a space (not ''), otherwise the last word of a
    # line is fused with the first word of the next line.
    formatted_txt = text.replace('\n', ' ').strip().lower()
    punct_strp_txt = (strip_punct(i) for i in formatted_txt.split(" "))
    return " ".join(i for i in punct_strp_txt if i)
|
17 |
+
|
18 |
+
def strip_punct(wrd):
    """
    Given a word, strips non alphanumeric characters that precede and follow it.

    Characters inside the word are left untouched; a falsy ``wrd`` is
    returned unchanged.
    """
    if not wrd:
        return wrd

    first, stop = 0, len(wrd)
    # Trim from the right, then from the left, until an alphanumeric
    # character is reached or nothing is left.
    while stop > first and not wrd[stop - 1].isalnum():
        stop -= 1
    while first < stop and not wrd[first].isalnum():
        first += 1
    return wrd[first:stop]
|
pytube/__init__.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# flake8: noqa: F401
|
2 |
+
# noreorder
|
3 |
+
"""
|
4 |
+
Pytube: a very serious Python library for downloading YouTube Videos.
|
5 |
+
"""
|
6 |
+
__title__ = "pytube"
|
7 |
+
__author__ = "Ronnie Ghose, Taylor Fox Dahlin, Nick Ficano"
|
8 |
+
__license__ = "The Unlicense (Unlicense)"
|
9 |
+
__js__ = None
|
10 |
+
__js_url__ = None
|
11 |
+
|
12 |
+
from pytube.version import __version__
|
13 |
+
from pytube.streams import Stream
|
14 |
+
from pytube.captions import Caption
|
15 |
+
from pytube.query import CaptionQuery, StreamQuery
|
16 |
+
from pytube.__main__ import YouTube
|
17 |
+
from pytube.contrib.playlist import Playlist
|
18 |
+
from pytube.contrib.channel import Channel
|
19 |
+
from pytube.contrib.search import Search
|
pytube/__main__.py
ADDED
@@ -0,0 +1,467 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This module implements the core developer interface for pytube.
|
3 |
+
|
4 |
+
The problem domain of the :class:`YouTube <YouTube> class focuses almost
|
5 |
+
exclusively on the developer interface. Pytube offloads the heavy lifting to
|
6 |
+
smaller peripheral modules and functions.
|
7 |
+
|
8 |
+
"""
|
9 |
+
import logging
|
10 |
+
from typing import Any, Callable, Dict, List, Optional
|
11 |
+
|
12 |
+
import pytube
|
13 |
+
import pytube.exceptions as exceptions
|
14 |
+
from pytube import extract, request
|
15 |
+
from pytube import Stream, StreamQuery
|
16 |
+
from pytube.helpers import install_proxy
|
17 |
+
from pytube.innertube import InnerTube
|
18 |
+
from pytube.metadata import YouTubeMetadata
|
19 |
+
from pytube.monostate import Monostate
|
20 |
+
|
21 |
+
logger = logging.getLogger(__name__)
|
22 |
+
|
23 |
+
|
24 |
+
class YouTube:
|
25 |
+
"""Core developer interface for pytube."""
|
26 |
+
|
27 |
+
def __init__(
|
28 |
+
self,
|
29 |
+
url: str,
|
30 |
+
on_progress_callback: Optional[Callable[[Any, bytes, int], None]] = None,
|
31 |
+
on_complete_callback: Optional[Callable[[Any, Optional[str]], None]] = None,
|
32 |
+
proxies: Dict[str, str] = None,
|
33 |
+
use_oauth: bool = False,
|
34 |
+
allow_oauth_cache: bool = True
|
35 |
+
):
|
36 |
+
"""Construct a :class:`YouTube <YouTube>`.
|
37 |
+
|
38 |
+
:param str url:
|
39 |
+
A valid YouTube watch URL.
|
40 |
+
:param func on_progress_callback:
|
41 |
+
(Optional) User defined callback function for stream download
|
42 |
+
progress events.
|
43 |
+
:param func on_complete_callback:
|
44 |
+
(Optional) User defined callback function for stream download
|
45 |
+
complete events.
|
46 |
+
:param dict proxies:
|
47 |
+
(Optional) A dict mapping protocol to proxy address which will be used by pytube.
|
48 |
+
:param bool use_oauth:
|
49 |
+
(Optional) Prompt the user to authenticate to YouTube.
|
50 |
+
If allow_oauth_cache is set to True, the user should only be prompted once.
|
51 |
+
:param bool allow_oauth_cache:
|
52 |
+
(Optional) Cache OAuth tokens locally on the machine. Defaults to True.
|
53 |
+
These tokens are only generated if use_oauth is set to True as well.
|
54 |
+
"""
|
55 |
+
self._js: Optional[str] = None # js fetched by js_url
|
56 |
+
self._js_url: Optional[str] = None # the url to the js, parsed from watch html
|
57 |
+
|
58 |
+
self._vid_info: Optional[Dict] = None # content fetched from innertube/player
|
59 |
+
|
60 |
+
self._watch_html: Optional[str] = None # the html of /watch?v=<video_id>
|
61 |
+
self._embed_html: Optional[str] = None
|
62 |
+
self._player_config_args: Optional[Dict] = None # inline js in the html containing
|
63 |
+
self._age_restricted: Optional[bool] = None
|
64 |
+
|
65 |
+
self._fmt_streams: Optional[List[Stream]] = None
|
66 |
+
|
67 |
+
self._initial_data = None
|
68 |
+
self._metadata: Optional[YouTubeMetadata] = None
|
69 |
+
|
70 |
+
# video_id part of /watch?v=<video_id>
|
71 |
+
self.video_id = extract.video_id(url)
|
72 |
+
|
73 |
+
self.watch_url = f"https://youtube.com/watch?v={self.video_id}"
|
74 |
+
self.embed_url = f"https://www.youtube.com/embed/{self.video_id}"
|
75 |
+
|
76 |
+
# Shared between all instances of `Stream` (Borg pattern).
|
77 |
+
self.stream_monostate = Monostate(
|
78 |
+
on_progress=on_progress_callback, on_complete=on_complete_callback
|
79 |
+
)
|
80 |
+
|
81 |
+
if proxies:
|
82 |
+
install_proxy(proxies)
|
83 |
+
|
84 |
+
self._author = None
|
85 |
+
self._title = None
|
86 |
+
self._publish_date = None
|
87 |
+
|
88 |
+
self.use_oauth = use_oauth
|
89 |
+
self.allow_oauth_cache = allow_oauth_cache
|
90 |
+
|
91 |
+
def __repr__(self):
|
92 |
+
return f'<pytube.__main__.YouTube object: videoId={self.video_id}>'
|
93 |
+
|
94 |
+
def __eq__(self, o: object) -> bool:
|
95 |
+
# Compare types and urls, if they're same return true, else return false.
|
96 |
+
return type(o) == type(self) and o.watch_url == self.watch_url
|
97 |
+
|
98 |
+
@property
|
99 |
+
def watch_html(self):
|
100 |
+
if self._watch_html:
|
101 |
+
return self._watch_html
|
102 |
+
self._watch_html = request.get(url=self.watch_url)
|
103 |
+
return self._watch_html
|
104 |
+
|
105 |
+
@property
|
106 |
+
def embed_html(self):
|
107 |
+
if self._embed_html:
|
108 |
+
return self._embed_html
|
109 |
+
self._embed_html = request.get(url=self.embed_url)
|
110 |
+
return self._embed_html
|
111 |
+
|
112 |
+
@property
|
113 |
+
def age_restricted(self):
|
114 |
+
if self._age_restricted:
|
115 |
+
return self._age_restricted
|
116 |
+
self._age_restricted = extract.is_age_restricted(self.watch_html)
|
117 |
+
return self._age_restricted
|
118 |
+
|
119 |
+
@property
|
120 |
+
def js_url(self):
|
121 |
+
if self._js_url:
|
122 |
+
return self._js_url
|
123 |
+
|
124 |
+
if self.age_restricted:
|
125 |
+
self._js_url = extract.js_url(self.embed_html)
|
126 |
+
else:
|
127 |
+
self._js_url = extract.js_url(self.watch_html)
|
128 |
+
|
129 |
+
return self._js_url
|
130 |
+
|
131 |
+
@property
|
132 |
+
def js(self):
|
133 |
+
if self._js:
|
134 |
+
return self._js
|
135 |
+
|
136 |
+
# If the js_url doesn't match the cached url, fetch the new js and update
|
137 |
+
# the cache; otherwise, load the cache.
|
138 |
+
if pytube.__js_url__ != self.js_url:
|
139 |
+
self._js = request.get(self.js_url)
|
140 |
+
pytube.__js__ = self._js
|
141 |
+
pytube.__js_url__ = self.js_url
|
142 |
+
else:
|
143 |
+
self._js = pytube.__js__
|
144 |
+
|
145 |
+
return self._js
|
146 |
+
|
147 |
+
@property
|
148 |
+
def initial_data(self):
|
149 |
+
if self._initial_data:
|
150 |
+
return self._initial_data
|
151 |
+
self._initial_data = extract.initial_data(self.watch_html)
|
152 |
+
return self._initial_data
|
153 |
+
|
154 |
+
@property
|
155 |
+
def streaming_data(self):
|
156 |
+
"""Return streamingData from video info."""
|
157 |
+
if 'streamingData' in self.vid_info:
|
158 |
+
return self.vid_info['streamingData']
|
159 |
+
else:
|
160 |
+
self.bypass_age_gate()
|
161 |
+
return self.vid_info['streamingData']
|
162 |
+
|
163 |
+
@property
|
164 |
+
def fmt_streams(self):
|
165 |
+
"""Returns a list of streams if they have been initialized.
|
166 |
+
|
167 |
+
If the streams have not been initialized, finds all relevant
|
168 |
+
streams and initializes them.
|
169 |
+
"""
|
170 |
+
self.check_availability()
|
171 |
+
if self._fmt_streams:
|
172 |
+
return self._fmt_streams
|
173 |
+
|
174 |
+
self._fmt_streams = []
|
175 |
+
|
176 |
+
stream_manifest = extract.apply_descrambler(self.streaming_data)
|
177 |
+
|
178 |
+
# If the cached js doesn't work, try fetching a new js file
|
179 |
+
# https://github.com/pytube/pytube/issues/1054
|
180 |
+
try:
|
181 |
+
extract.apply_signature(stream_manifest, self.vid_info, self.js)
|
182 |
+
except exceptions.ExtractError:
|
183 |
+
# To force an update to the js file, we clear the cache and retry
|
184 |
+
self._js = None
|
185 |
+
self._js_url = None
|
186 |
+
pytube.__js__ = None
|
187 |
+
pytube.__js_url__ = None
|
188 |
+
extract.apply_signature(stream_manifest, self.vid_info, self.js)
|
189 |
+
|
190 |
+
# build instances of :class:`Stream <Stream>`
|
191 |
+
# Initialize stream objects
|
192 |
+
for stream in stream_manifest:
|
193 |
+
video = Stream(
|
194 |
+
stream=stream,
|
195 |
+
monostate=self.stream_monostate,
|
196 |
+
)
|
197 |
+
self._fmt_streams.append(video)
|
198 |
+
|
199 |
+
self.stream_monostate.title = self.title
|
200 |
+
self.stream_monostate.duration = self.length
|
201 |
+
|
202 |
+
return self._fmt_streams
|
203 |
+
|
204 |
+
def check_availability(self):
|
205 |
+
"""Check whether the video is available.
|
206 |
+
|
207 |
+
Raises different exceptions based on why the video is unavailable,
|
208 |
+
otherwise does nothing.
|
209 |
+
"""
|
210 |
+
status, messages = extract.playability_status(self.watch_html)
|
211 |
+
|
212 |
+
for reason in messages:
|
213 |
+
if status == 'UNPLAYABLE':
|
214 |
+
if reason == (
|
215 |
+
'Join this channel to get access to members-only content '
|
216 |
+
'like this video, and other exclusive perks.'
|
217 |
+
):
|
218 |
+
raise exceptions.MembersOnly(video_id=self.video_id)
|
219 |
+
elif reason == 'This live stream recording is not available.':
|
220 |
+
raise exceptions.RecordingUnavailable(video_id=self.video_id)
|
221 |
+
else:
|
222 |
+
raise exceptions.VideoUnavailable(video_id=self.video_id)
|
223 |
+
elif status == 'LOGIN_REQUIRED':
|
224 |
+
if reason == (
|
225 |
+
'This is a private video. '
|
226 |
+
'Please sign in to verify that you may see it.'
|
227 |
+
):
|
228 |
+
raise exceptions.VideoPrivate(video_id=self.video_id)
|
229 |
+
elif status == 'ERROR':
|
230 |
+
if reason == 'Video unavailable':
|
231 |
+
raise exceptions.VideoUnavailable(video_id=self.video_id)
|
232 |
+
elif status == 'LIVE_STREAM':
|
233 |
+
raise exceptions.LiveStreamError(video_id=self.video_id)
|
234 |
+
|
235 |
+
@property
def vid_info(self):
    """Return the player response for this video, fetching it on first use.

    When ``self._vid_info`` is already populated it is returned as-is.
    Otherwise an ``InnerTube`` client is built from the instance's OAuth
    settings, its ``player`` endpoint is queried with ``self.video_id``
    and the response is stored on ``self._vid_info``.

    :rtype: Dict[Any, Any]
    """
    if not self._vid_info:
        client = InnerTube(
            use_oauth=self.use_oauth,
            allow_cache=self.allow_oauth_cache,
        )
        self._vid_info = client.player(self.video_id)
    return self._vid_info
|
249 |
+
|
250 |
+
def bypass_age_gate(self):
    """Attempt to update the vid_info by bypassing the age gate.

    Queries the ``player`` endpoint through an ``InnerTube`` client
    configured with ``client='ANDROID_EMBED'`` and stores the response on
    ``self._vid_info``.

    :raises exceptions.AgeRestrictedError:
        If the response still reports an ``'UNPLAYABLE'`` status.
    """
    response = InnerTube(
        client='ANDROID_EMBED',
        use_oauth=self.use_oauth,
        allow_cache=self.allow_oauth_cache,
    ).player(self.video_id)

    status = response['playabilityStatus'].get('status', None)

    # If we still can't access the video, raise an exception
    # (tier 3 age restriction)
    if status == 'UNPLAYABLE':
        raise exceptions.AgeRestrictedError(self.video_id)

    self._vid_info = response
|
267 |
+
|
268 |
+
@property
def caption_tracks(self) -> List[pytube.Caption]:
    """Build a :class:`Caption <Caption>` for every caption track.

    The raw tracks are read from
    ``vid_info["captions"]["playerCaptionsTracklistRenderer"]["captionTracks"]``;
    any missing level yields an empty list.

    :rtype: List[Caption]
    """
    captions_section = self.vid_info.get("captions", {})
    renderer = captions_section.get("playerCaptionsTracklistRenderer", {})
    tracks = []
    for raw_track in renderer.get("captionTracks", []):
        tracks.append(pytube.Caption(raw_track))
    return tracks
|
280 |
+
|
281 |
+
@property
def captions(self) -> pytube.CaptionQuery:
    """Wrap the caption tracks in a query interface.

    :rtype: :class:`CaptionQuery <CaptionQuery>`.
    """
    tracks = self.caption_tracks
    return pytube.CaptionQuery(tracks)
|
288 |
+
|
289 |
+
@property
def streams(self) -> StreamQuery:
    """Interface to query both adaptive (DASH) and progressive streams.

    ``check_availability`` runs first, so an unavailable video raises
    before any stream is exposed.

    :rtype: :class:`StreamQuery <StreamQuery>`.
    """
    self.check_availability()
    available_streams = self.fmt_streams
    return StreamQuery(available_streams)
|
297 |
+
|
298 |
+
@property
def thumbnail_url(self) -> str:
    """Return the URL of the video thumbnail.

    Uses the last entry of ``videoDetails.thumbnail.thumbnails`` when that
    list is present and non-empty; otherwise falls back to the
    ``maxresdefault.jpg`` URL built from ``self.video_id``.

    :rtype: str
    """
    details = self.vid_info.get("videoDetails", {})
    thumbnails = details.get("thumbnail", {}).get("thumbnails")

    if not thumbnails:
        return f"https://img.youtube.com/vi/{self.video_id}/maxresdefault.jpg"

    # last item has max size
    largest = thumbnails[-1]
    return largest["url"]
|
314 |
+
|
315 |
+
@property
def publish_date(self):
    """Return the publish date, extracting it from the watch page once.

    A truthy ``self._publish_date`` is returned directly; otherwise the
    value comes from ``extract.publish_date(self.watch_html)`` and is
    stored for later calls.

    :rtype: datetime
    """
    if not self._publish_date:
        self._publish_date = extract.publish_date(self.watch_html)
    return self._publish_date


@publish_date.setter
def publish_date(self, value):
    """Override the stored publish date."""
    self._publish_date = value
|
330 |
+
|
331 |
+
@property
def title(self) -> str:
    """Return the video title, reading it from ``vid_info`` on first use.

    :rtype: str
    :raises exceptions.PytubeError:
        If the title key is missing and ``check_availability`` did not
        already raise a more specific exception.
    """
    if not self._title:
        try:
            self._title = self.vid_info['videoDetails']['title']
        except KeyError:
            # Check_availability will raise the correct exception in most
            # cases; if it doesn't, ask for a report.
            self.check_availability()
            message = (
                f'Exception while accessing title of {self.watch_url}. '
                'Please file a bug report at https://github.com/pytube/pytube'
            )
            raise exceptions.PytubeError(message)
    return self._title


@title.setter
def title(self, value):
    """Override the stored title."""
    self._title = value
|
359 |
+
|
360 |
+
@property
def description(self) -> str:
    """Return ``videoDetails.shortDescription`` from ``vid_info``.

    Yields ``None`` when either key is absent.

    :rtype: str
    """
    details = self.vid_info.get("videoDetails", {})
    return details.get("shortDescription")
|
367 |
+
|
368 |
+
@property
def rating(self) -> float:
    """Return ``videoDetails.averageRating`` from ``vid_info``.

    Yields ``None`` when either key is absent.

    :rtype: float
    """
    details = self.vid_info.get("videoDetails", {})
    return details.get("averageRating")
|
376 |
+
|
377 |
+
@property
def length(self) -> int:
    """Return the video length in seconds.

    Converts ``videoDetails.lengthSeconds`` from ``vid_info`` with ``int``.

    :rtype: int
    """
    details = self.vid_info.get('videoDetails', {})
    seconds = details.get('lengthSeconds')
    return int(seconds)
|
384 |
+
|
385 |
+
@property
def views(self) -> int:
    """Return the number of times the video has been viewed.

    Converts ``videoDetails.viewCount`` from ``vid_info`` with ``int``.

    :rtype: int
    """
    details = self.vid_info.get("videoDetails", {})
    count = details.get("viewCount")
    return int(count)
|
392 |
+
|
393 |
+
@property
def author(self) -> str:
    """Return the video author, reading it from ``vid_info`` on first use.

    Falls back to ``"unknown"`` when ``videoDetails.author`` is absent.

    :rtype: str
    """
    if not self._author:
        details = self.vid_info.get("videoDetails", {})
        self._author = details.get("author", "unknown")
    return self._author


@author.setter
def author(self, value):
    """Override the stored author."""
    self._author = value
|
409 |
+
|
410 |
+
@property
def keywords(self) -> List[str]:
    """Return ``videoDetails.keywords`` from ``vid_info``.

    Yields an empty list when the key is absent.

    :rtype: List[str]
    """
    details = self.vid_info.get('videoDetails', {})
    return details.get('keywords', [])
|
417 |
+
|
418 |
+
@property
def channel_id(self) -> str:
    """Return the id of the channel that posted the video.

    Reads ``videoDetails.channelId`` from ``vid_info``; yields ``None``
    when the key is absent.

    :rtype: str
    """
    details = self.vid_info.get('videoDetails', {})
    return details.get('channelId', None)
|
425 |
+
|
426 |
+
@property
def channel_url(self) -> str:
    """Build the poster's channel URL from ``self.channel_id``.

    :rtype: str
    """
    return 'https://www.youtube.com/channel/{}'.format(self.channel_id)
|
433 |
+
|
434 |
+
@property
def metadata(self) -> Optional[YouTubeMetadata]:
    """Return the video metadata, extracting it on first use.

    A truthy ``self._metadata`` is returned directly; otherwise the value
    comes from ``extract.metadata(self.initial_data)`` and is stored for
    later calls.

    :rtype: YouTubeMetadata
    """
    if not self._metadata:
        self._metadata = extract.metadata(self.initial_data)
    return self._metadata
|
445 |
+
|
446 |
+
def register_on_progress_callback(self, func: Callable[[Any, bytes, int], None]):
    """Install a download-progress callback after construction.

    The callable is stored as ``on_progress`` on ``self.stream_monostate``.

    :param callable func:
        A callback function that takes ``stream``, ``chunk``,
        and ``bytes_remaining`` as parameters.

    :rtype: None

    """
    monostate = self.stream_monostate
    monostate.on_progress = func
|
457 |
+
|
458 |
+
def register_on_complete_callback(self, func: Callable[[Any, Optional[str]], None]):
    """Install a download-complete callback after construction.

    The callable is stored as ``on_complete`` on ``self.stream_monostate``.

    :param callable func:
        A callback function that takes ``stream`` and ``file_path``.

    :rtype: None

    """
    monostate = self.stream_monostate
    monostate.on_complete = func
|
pytube/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (830 Bytes). View file
|
|
pytube/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (819 Bytes). View file
|
|
pytube/__pycache__/__main__.cpython-310.pyc
ADDED
Binary file (12.6 kB). View file
|
|
pytube/__pycache__/__main__.cpython-39.pyc
ADDED
Binary file (12.8 kB). View file
|
|
pytube/__pycache__/captions.cpython-310.pyc
ADDED
Binary file (4.95 kB). View file
|
|
pytube/__pycache__/captions.cpython-39.pyc
ADDED
Binary file (4.92 kB). View file
|
|
pytube/__pycache__/cipher.cpython-310.pyc
ADDED
Binary file (18.9 kB). View file
|
|
pytube/__pycache__/cipher.cpython-39.pyc
ADDED
Binary file (18.9 kB). View file
|
|
pytube/__pycache__/exceptions.cpython-310.pyc
ADDED
Binary file (5.01 kB). View file
|
|
pytube/__pycache__/exceptions.cpython-39.pyc
ADDED
Binary file (5.55 kB). View file
|
|
pytube/__pycache__/extract.cpython-310.pyc
ADDED
Binary file (15.4 kB). View file
|
|
pytube/__pycache__/extract.cpython-39.pyc
ADDED
Binary file (15.5 kB). View file
|
|
pytube/__pycache__/helpers.cpython-310.pyc
ADDED
Binary file (9.83 kB). View file
|
|
pytube/__pycache__/helpers.cpython-39.pyc
ADDED
Binary file (9.84 kB). View file
|
|
pytube/__pycache__/innertube.cpython-310.pyc
ADDED
Binary file (8.63 kB). View file
|
|
pytube/__pycache__/innertube.cpython-39.pyc
ADDED
Binary file (8.63 kB). View file
|
|
pytube/__pycache__/itags.cpython-310.pyc
ADDED
Binary file (2.78 kB). View file
|
|
pytube/__pycache__/itags.cpython-39.pyc
ADDED
Binary file (2.26 kB). View file
|
|
pytube/__pycache__/metadata.cpython-310.pyc
ADDED
Binary file (1.71 kB). View file
|
|
pytube/__pycache__/metadata.cpython-39.pyc
ADDED
Binary file (1.71 kB). View file
|
|
pytube/__pycache__/monostate.cpython-310.pyc
ADDED
Binary file (728 Bytes). View file
|
|
pytube/__pycache__/monostate.cpython-39.pyc
ADDED
Binary file (695 Bytes). View file
|
|
pytube/__pycache__/parser.cpython-310.pyc
ADDED
Binary file (3.94 kB). View file
|
|
pytube/__pycache__/parser.cpython-39.pyc
ADDED
Binary file (3.93 kB). View file
|
|
pytube/__pycache__/query.cpython-310.pyc
ADDED
Binary file (14.1 kB). View file
|
|
pytube/__pycache__/query.cpython-39.pyc
ADDED
Binary file (14.3 kB). View file
|
|
pytube/__pycache__/request.cpython-310.pyc
ADDED
Binary file (5.74 kB). View file
|
|
pytube/__pycache__/request.cpython-39.pyc
ADDED
Binary file (5.69 kB). View file
|
|
pytube/__pycache__/streams.cpython-310.pyc
ADDED
Binary file (10.9 kB). View file
|
|
pytube/__pycache__/streams.cpython-39.pyc
ADDED
Binary file (10.8 kB). View file
|
|
pytube/__pycache__/version.cpython-310.pyc
ADDED
Binary file (220 Bytes). View file
|
|
pytube/__pycache__/version.cpython-39.pyc
ADDED
Binary file (214 Bytes). View file
|
|
pytube/captions.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
import xml.etree.ElementTree as ElementTree
|
5 |
+
from html import unescape
|
6 |
+
from typing import Dict, Optional
|
7 |
+
|
8 |
+
from pytube import request
|
9 |
+
from pytube.helpers import safe_filename, target_directory
|
10 |
+
|
11 |
+
|
12 |
+
class Caption:
    """Container for caption tracks."""

    def __init__(self, caption_track: Dict):
        """Construct a :class:`Caption <Caption>`.

        :param dict caption_track:
            Caption track data extracted from ``watch_html``. Must contain
            ``name`` and ``vssId``; ``baseUrl`` is optional.
        """
        self.url = caption_track.get("baseUrl")

        # Certain videos have runs instead of simpleText
        # this handles that edge case
        name_dict = caption_track['name']
        if 'simpleText' in name_dict:
            self.name = name_dict['simpleText']
        else:
            # The last run carrying text wins. Default to an empty string so
            # ``name`` always exists (``__repr__`` reads it).
            self.name = ""
            for el in name_dict['runs']:
                if 'text' in el:
                    self.name = el['text']

        # Use "vssId" instead of "languageCode", fix issue #779
        # Remove preceding '.' for backwards compatibility, e.g.:
        # English -> vssId: .en, languageCode: en
        # English (auto-generated) -> vssId: a.en, languageCode: en
        self.code = caption_track["vssId"].strip('.')

    @property
    def xml_captions(self) -> str:
        """Download the xml caption tracks."""
        return request.get(self.url)

    def generate_srt_captions(self) -> str:
        """Generate "SubRip Subtitle" captions.

        Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and
        recompiles them into the "SubRip Subtitle" format.
        """
        return self.xml_caption_to_srt(self.xml_captions)

    @staticmethod
    def float_to_srt_time_format(d: float) -> str:
        """Convert decimal durations into proper srt format.

        The value is rounded to whole milliseconds first, so a fraction that
        rounds up carries into the seconds field instead of producing a
        malformed millisecond part. Hours are not wrapped at 24.

        :param float d:
            Duration or offset in seconds.
        :rtype: str
        :returns:
            SubRip Subtitle (str) formatted time duration.

        float_to_srt_time_format(3.89) -> '00:00:03,890'
        """
        total_ms = int(round(d * 1000))
        total_seconds, ms = divmod(total_ms, 1000)
        total_minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(total_minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"

    def xml_caption_to_srt(self, xml_captions: str) -> str:
        """Convert xml caption tracks to "SubRip Subtitle (srt)".

        :param str xml_captions:
            XML formatted caption tracks.
        :rtype: str
        :returns:
            The numbered srt segments joined by blank lines.
        """
        segments = []
        root = ElementTree.fromstring(xml_captions)
        # srt sequence numbers are 1-based.
        for sequence_number, child in enumerate(root, start=1):
            text = child.text or ""
            # Flatten line breaks and collapse the double spaces they leave.
            caption = unescape(text.replace("\n", " ").replace("  ", " "))
            try:
                duration = float(child.attrib["dur"])
            except KeyError:
                # A segment without a duration starts and ends together.
                duration = 0.0
            start = float(child.attrib["start"])
            end = start + duration
            line = "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=self.float_to_srt_time_format(start),
                end=self.float_to_srt_time_format(end),
                text=caption,
            )
            segments.append(line)
        return "\n".join(segments).strip()

    def download(
        self,
        title: str,
        srt: bool = True,
        output_path: Optional[str] = None,
        filename_prefix: Optional[str] = None,
    ) -> str:
        """Write the caption track to disk.

        :param title:
            Output filename (stem only) for writing the caption file.
            A trailing ``.srt`` or ``.xml`` extension is dropped.
        :type title: str
        :param srt:
            Set to True to download srt, false to download xml. Defaults to True.
        :type srt: bool
        :param output_path:
            (optional) Output path for writing the file; passed to
            ``target_directory`` to resolve the destination directory.
        :type output_path: str or None
        :param filename_prefix:
            (optional) A string that will be prepended to the filename.
            For example a number in a playlist or the name of a series.
            If one is not specified, nothing will be prepended
            This is separate from filename so you can use the default
            filename but still add a prefix.
        :type filename_prefix: str or None

        :rtype: str
        :returns:
            Path of the written file.
        """
        if title.endswith((".srt", ".xml")):
            filename = title.rsplit(".", 1)[0]
        else:
            filename = title

        if filename_prefix:
            filename = f"{safe_filename(filename_prefix)}{filename}"

        filename = safe_filename(filename)

        filename += f" ({self.code})"
        filename += ".srt" if srt else ".xml"

        file_path = os.path.join(target_directory(output_path), filename)

        with open(file_path, "w", encoding="utf-8") as file_handle:
            if srt:
                file_handle.write(self.generate_srt_captions())
            else:
                file_handle.write(self.xml_captions)

        return file_path

    def __repr__(self):
        """Printable object representation."""
        return '<Caption lang="{s.name}" code="{s.code}">'.format(s=self)
|
pytube/cipher.py
ADDED
@@ -0,0 +1,697 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This module contains all logic necessary to decipher the signature.
|
3 |
+
|
4 |
+
YouTube's strategy to restrict downloading videos is to send a ciphered version
|
5 |
+
of the signature to the client, along with the decryption algorithm obfuscated
|
6 |
+
in JavaScript. For the clients to play the videos, JavaScript must take the
|
7 |
+
ciphered version, cycle it through a series of "transform functions," and then
|
8 |
+
signs the media URL with the output.
|
9 |
+
|
10 |
+
This module is responsible for (1) finding and extracting those "transform
|
11 |
+
functions" (2) maps them to Python equivalents and (3) taking the ciphered
|
12 |
+
signature and decoding it.
|
13 |
+
|
14 |
+
"""
|
15 |
+
import logging
|
16 |
+
import re
|
17 |
+
from itertools import chain
|
18 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
19 |
+
|
20 |
+
from pytube.exceptions import ExtractError, RegexMatchError
|
21 |
+
from pytube.helpers import cache, regex_search
|
22 |
+
from pytube.parser import find_object_from_startpoint, throttling_array_split
|
23 |
+
|
24 |
+
logger = logging.getLogger(__name__)
|
25 |
+
|
26 |
+
|
27 |
+
class Cipher:
    """Deciphers stream signatures and computes the throttling parameter.

    The transform plan/map (for signatures) and the throttling plan/array
    (for the ``n`` parameter) are extracted from the player JavaScript when
    the instance is created.
    """

    def __init__(self, js: str):
        """Extract every plan and lookup table from the player JavaScript.

        :param str js:
            The contents of the base.js asset file.
        :raises RegexMatchError:
            If the first transform-plan entry does not start with a
            variable name followed by a non-word character.
        """
        self.transform_plan: List[str] = get_transform_plan(js)
        var_regex = re.compile(r"^\w+\W")
        var_match = var_regex.search(self.transform_plan[0])
        if not var_match:
            raise RegexMatchError(
                caller="__init__", pattern=var_regex.pattern
            )
        # Drop the trailing non-word character, e.g. "DE." -> "DE".
        var = var_match.group(0)[:-1]
        self.transform_map = get_transform_map(js, var)
        self.js_func_patterns = [
            r"\w+\.(\w+)\(\w,(\d+)\)",
            r"\w+\[(\"\w+\")\]\(\w,(\d+)\)"
        ]

        self.throttling_plan = get_throttling_plan(js)
        self.throttling_array = get_throttling_function_array(js)

        self.calculated_n = None

        # Per-instance memo for parse_function. Kept on the instance so the
        # cached entries are released together with the Cipher itself.
        self._parse_cache: Dict[str, Tuple[str, int]] = {}

    def calculate_n(self, initial_n: list):
        """Converts n to the correct value to prevent throttling.

        :param list initial_n:
            The characters of the original ``n`` value. The list is mutated
            in place by the throttling steps.
        :rtype: str
        :returns:
            The joined characters after all steps ran. The first truthy
            result is stored and returned on every later call.
        :raises ExtractError:
            If a step refers to an array entry that is not callable.
        """
        if self.calculated_n:
            return self.calculated_n

        # First, update all instances of 'b' with the list(initial_n)
        for i, entry in enumerate(self.throttling_array):
            if entry == 'b':
                self.throttling_array[i] = initial_n

        for step in self.throttling_plan:
            # Each step holds indices into the throttling array:
            # (function, first argument[, second argument]).
            curr_func = self.throttling_array[int(step[0])]
            if not callable(curr_func):
                logger.debug(f'{curr_func} is not callable.')
                logger.debug(f'Throttling array:\n{self.throttling_array}\n')
                raise ExtractError(f'{curr_func} is not callable.')

            first_arg = self.throttling_array[int(step[1])]

            if len(step) == 2:
                curr_func(first_arg)
            elif len(step) == 3:
                second_arg = self.throttling_array[int(step[2])]
                curr_func(first_arg, second_arg)

        self.calculated_n = ''.join(initial_n)
        return self.calculated_n

    def get_signature(self, ciphered_signature: str) -> str:
        """Decipher the signature.

        Taking the ciphered signature, applies the transform functions.

        :param str ciphered_signature:
            The ciphered signature sent in the ``player_config``.
        :rtype: str
        :returns:
            Decrypted signature required to download the media content.
        """
        signature = list(ciphered_signature)

        for js_func in self.transform_plan:
            name, argument = self.parse_function(js_func)  # type: ignore
            signature = self.transform_map[name](signature, argument)
            logger.debug(
                "applied transform function\n"
                "output: %s\n"
                "js_function: %s\n"
                "argument: %d\n"
                "function: %s",
                "".join(signature),
                name,
                argument,
                self.transform_map[name],
            )

        return "".join(signature)

    def parse_function(self, js_func: str) -> Tuple[str, int]:
        """Parse the Javascript transform function.

        Break a JavaScript transform function down into a two element ``tuple``
        containing the function name and some integer-based argument.
        Successful results are memoised per instance.

        :param str js_func:
            The JavaScript version of the transform function.
        :rtype: tuple
        :returns:
            two element tuple containing the function name and an argument.
        :raises RegexMatchError:
            If none of ``js_func_patterns`` matches.

        **Example**:

        parse_function('DE.AJ(a,15)')
        ('AJ', 15)

        """
        try:
            return self._parse_cache[js_func]
        except KeyError:
            pass

        logger.debug("parsing transform function")
        for pattern in self.js_func_patterns:
            parse_match = re.search(pattern, js_func)
            if parse_match:
                fn_name, fn_arg = parse_match.groups()
                parsed = (fn_name, int(fn_arg))
                self._parse_cache[js_func] = parsed
                return parsed

        raise RegexMatchError(
            caller="parse_function", pattern="js_func_patterns"
        )
|
136 |
+
|
137 |
+
|
138 |
+
def get_initial_function_name(js: str) -> str:
    """Extract the name of the function responsible for computing the signature.

    The patterns are tried in order and the first match wins. The name is
    always read from the named group ``sig``, because one pattern has an
    earlier capturing group (the quote character around ``signature``).

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        Function name from regex match
    :raises RegexMatchError:
        If none of the patterns matches.
    """

    function_patterns = [
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r"\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(",
        r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
    ]
    logger.debug("finding initial function name")
    for pattern in function_patterns:
        function_match = re.search(pattern, js)
        if function_match:
            logger.debug("finished regex search, matched: %s", pattern)
            # Not group(1): that is the quote character for the
            # `(["\'])signature\1` pattern.
            return function_match.group("sig")

    raise RegexMatchError(
        caller="get_initial_function_name", pattern="multiple"
    )
|
172 |
+
|
173 |
+
|
174 |
+
def get_transform_plan(js: str) -> List[str]:
    """Extract the "transform plan".

    The plan is the ordered list of JavaScript calls found in the body of
    the signature function, i.e. the functions the ciphered signature is
    cycled through to obtain the actual signature.

    :param str js:
        The contents of the base.js asset file.
    :rtype: List[str]
    :returns:
        The statements of the signature function body, split on ``;``.

    **Example**:

    ['DE.AJ(a,15)',
    'DE.VR(a,3)',
    'DE.AJ(a,51)',
    'DE.VR(a,3)',
    'DE.kT(a,51)',
    'DE.kT(a,8)',
    'DE.VR(a,3)',
    'DE.kT(a,21)']
    """
    escaped_name = re.escape(get_initial_function_name(js))
    plan_pattern = (
        r"%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}" % escaped_name
    )
    logger.debug("getting transform plan")
    plan_body = regex_search(plan_pattern, js, group=1)
    return plan_body.split(";")
|
198 |
+
|
199 |
+
|
200 |
+
def get_transform_object(js: str, var: str) -> List[str]:
    """Extract the "transform object".

    The "transform object" holds the function definitions referenced by the
    "transform plan". ``var`` is the obfuscated name of the variable that
    stores them; for a plan entry such as ``DE.AJ(a,15)`` it is "DE".

    :param str js:
        The contents of the base.js asset file.
    :param str var:
        The obfuscated variable name that stores an object with all functions
        that descrambles the signature.
    :rtype: List[str]
    :returns:
        The ``name:function`` entries of the object, with newlines replaced
        by spaces and split on ``", "``.
    :raises RegexMatchError:
        If no ``var <var>={...};`` definition is found.

    **Example**:

    >>> get_transform_object(js, 'DE')
    ['AJ:function(a){a.reverse()}',
    'VR:function(a,b){a.splice(0,b)}',
    'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}']

    """
    pattern = r"var %s={(.*?)};" % re.escape(var)
    logger.debug("getting transform object")
    # DOTALL lets the object body span several lines.
    found = re.search(pattern, js, flags=re.DOTALL)
    if found is None:
        raise RegexMatchError(caller="get_transform_object", pattern=pattern)

    object_body = found.group(1)
    single_line = object_body.replace("\n", " ")
    return single_line.split(", ")
|
230 |
+
|
231 |
+
|
232 |
+
def get_transform_map(js: str, var: str) -> Dict:
    """Build a transform function lookup.

    Maps each obfuscated JavaScript function name of the transform object
    to the result of ``map_functions`` for that function's source.

    :param str js:
        The contents of the base.js asset file.
    :param str var:
        The obfuscated variable name that stores an object with all functions
        that descrambles the signature.
    :rtype: Dict
    :returns:
        Lookup table keyed by the JavaScript function name.

    """
    # AJ:function(a){a.reverse()} => AJ, function(a){a.reverse()}
    name_and_source = (
        entry.split(":", 1) for entry in get_transform_object(js, var)
    )
    return {
        js_name: map_functions(js_source)
        for js_name, js_source in name_and_source
    }
|
253 |
+
|
254 |
+
|
255 |
+
def get_throttling_function_name(js: str) -> str:
    """Extract the name of the function that computes the throttling parameter.

    The call site ``a.X&&(b=a.get("n"))&&(b=NAME(b)`` or
    ``...&&(b=NAME[idx](b)`` is located first. Without an index, ``NAME``
    is the function itself. With an index, ``NAME`` is an array declared as
    ``var NAME=[...]`` and the function is the entry at ``idx``.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The name of the function used to compute the throttling parameter.
    :raises RegexMatchError:
        If no pattern matches, or an indexed call is found but the array
        declaration is not.
    """
    function_patterns = [
        # https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
        # https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8
        # var Bpa = [iha];
        # ...
        # a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b),
        # Bpa.length || iha("")) }};
        # In the above case, `iha` is the relevant function name
        r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&\s*'
        r'\([a-z]\s*=\s*([a-zA-Z0-9$]+)(\[\d+\])?\([a-z]\)',
    ]
    logger.debug('Finding throttling function name')
    for pattern in function_patterns:
        function_match = re.search(pattern, js)
        if not function_match:
            continue

        logger.debug("finished regex search, matched: %s", pattern)
        groups = function_match.groups()
        name = groups[0]
        # The index group is optional; it is None for a direct call.
        idx = groups[1] if len(groups) > 1 else None
        if not idx:
            # Direct call: the captured identifier is the function name.
            return name

        array = re.search(
            r'var {nfunc}\s*=\s*(\[.+?\]);'.format(nfunc=re.escape(name)),
            js
        )
        if array:
            entries = [
                x.strip() for x in array.group(1).strip("[]").split(",")
            ]
            return entries[int(idx.strip("[]"))]

    raise RegexMatchError(
        caller="get_throttling_function_name", pattern="multiple"
    )
|
299 |
+
|
300 |
+
|
301 |
+
def get_throttling_function_code(js: str) -> str:
    """Extract the raw code for the throttling function.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The function definition header (e.g. ``Dea=function(a)``) followed
        by the function body with its line breaks removed.
    :raises RegexMatchError:
        If the definition of the throttling function cannot be located.
    """
    # Begin by extracting the correct function name
    name = re.escape(get_throttling_function_name(js))

    # Identify where the function is defined
    pattern_start = r"%s=function\(\w\)" % name
    match = re.search(pattern_start, js)
    if match is None:
        # Report the missing definition explicitly rather than failing
        # with an AttributeError on ``None`` below.
        raise RegexMatchError(
            caller="get_throttling_function_code", pattern=pattern_start
        )

    # Extract the code within curly braces for the function itself, and merge any split lines
    code_lines_list = find_object_from_startpoint(js, match.end()).split('\n')
    joined_lines = "".join(code_lines_list)

    # Prepend function definition (e.g. `Dea=function(a)`)
    return match.group(0) + joined_lines
|
324 |
+
|
325 |
+
|
326 |
+
def get_throttling_function_array(js: str) -> List[Any]:
    """Extract the "c" array.

    Each textual element of the JavaScript array is converted as follows:
    integers become ``int``; ``null`` becomes a reference to the returned
    list itself; double-quoted strings lose their quotes; recognised
    ``function`` literals become the matching Python callable; anything
    else is kept as the raw string.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        The array of various integers, arrays, and functions.
    :raises RegexMatchError:
        If the ``,c=[`` marker cannot be found in the throttling function.
    """
    raw_code = get_throttling_function_code(js)

    array_start = r",c=\["
    match = re.search(array_start, raw_code)
    if match is None:
        raise RegexMatchError(
            caller="get_throttling_function_array", pattern=array_start
        )

    # Step back one character so the scan starts on the opening "[".
    array_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1)
    str_array = throttling_array_split(array_raw)

    # JS function shapes and their Python equivalents.  Built once, not per
    # element; order matters because the first matching pattern wins.
    mapper = (
        (r"{for\(\w=\(\w%\w\.length\+\w\.length\)%\w\.length;\w--;\)\w\.unshift\(\w.pop\(\)\)}", throttling_unshift),  # noqa:E501
        (r"{\w\.reverse\(\)}", throttling_reverse),
        (r"{\w\.push\(\w\)}", throttling_push),
        (r";var\s\w=\w\[0\];\w\[0\]=\w\[\w\];\w\[\w\]=\w}", throttling_swap),
        (r"case\s\d+", throttling_cipher_function),
        (r"\w\.splice\(0,1,\w\.splice\(\w,1,\w\[0\]\)\[0\]\)", throttling_nested_splice),  # noqa:E501
        (r";\w\.splice\(\w,1\)}", js_splice),
        (r"\w\.splice\(-\w\)\.reverse\(\)\.forEach\(function\(\w\){\w\.unshift\(\w\)}\)", throttling_prepend),  # noqa:E501
        (r"for\(var \w=\w\.length;\w;\)\w\.push\(\w\.splice\(--\w,1\)\[0\]\)}", throttling_reverse),  # noqa:E501
    )

    converted_array = []
    for el in str_array:
        try:
            converted_array.append(int(el))
            continue
        except ValueError:
            # Not an integer value.
            pass

        if el == 'null':
            converted_array.append(None)
            continue

        if el.startswith('"') and el.endswith('"'):
            # Convert e.g. '"abcdef"' to string without quotation marks, 'abcdef'
            converted_array.append(el[1:-1])
            continue

        if el.startswith('function'):
            # Exactly one entry must be appended per source element, otherwise
            # every later index used by the throttling plan is shifted.
            for pattern, fn in mapper:
                if re.search(pattern, el):
                    converted_array.append(fn)
                    break
            else:
                # Unrecognised function: keep the raw source text.
                converted_array.append(el)
            continue

        converted_array.append(el)

    # Replace null elements with array itself
    for i, item in enumerate(converted_array):
        if item is None:
            converted_array[i] = converted_array

    return converted_array
|
390 |
+
|
391 |
+
|
392 |
+
def get_throttling_plan(js: str):
    """Extract the "throttling plan".

    The "throttling plan" is a list of tuples used for calling functions
    in the c array. The first element of the tuple is the index of the
    function to call, and any remaining elements of the tuple are arguments
    to pass to that function.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        A list of 2- or 3-tuples of index strings (as captured by the
        regex, not converted to ``int``), one tuple per ``c[x](c[y])`` or
        ``c[x](c[y],c[z])`` step.
    :raises RegexMatchError:
        If no ``try{`` block is found in the throttling function.
    """
    raw_code = get_throttling_function_code(js)

    transform_start = r"try{"
    start_match = re.search(transform_start, raw_code)
    if start_match is None:
        raise RegexMatchError(
            caller="get_throttling_plan", pattern=transform_start
        )

    # Step back one character so the scan starts on the opening "{".
    transform_plan_raw = find_object_from_startpoint(
        raw_code, start_match.span()[1] - 1
    )

    # Steps are either c[x](c[y]) or c[x](c[y],c[z])
    step_start = r"c\[(\d+)\]\(c\[(\d+)\](,c(\[(\d+)\]))?\)"
    transform_steps = []
    for step in re.findall(step_start, transform_plan_raw):
        # step[4] is the innermost group: the optional second argument index.
        if step[4] != '':
            transform_steps.append((step[0], step[1], step[4]))
        else:
            transform_steps.append((step[0], step[1]))

    return transform_steps
|
425 |
+
|
426 |
+
|
427 |
+
def reverse(arr: List, _: Optional[Any] = None):
    """Reverse elements in a list.

    This function is equivalent to:

    .. code-block:: javascript

        function(a, b) { a.reverse() }

    This method takes an unused ``b`` variable as their transform functions
    universally sent two arguments. It defaults to ``None`` so the function
    can also be called with the list alone.

    :param list arr:
        The list to reverse; it is not modified.
    :param _:
        Ignored.
    :returns:
        A new list with the elements of ``arr`` in reverse order.

    **Example**:

    >>> reverse([1, 2, 3, 4])
    [4, 3, 2, 1]
    """
    return arr[::-1]
|
445 |
+
|
446 |
+
|
447 |
+
def splice(arr: List, b: int):
    """Drop the first ``b`` items of a list.

    This function is equivalent to:

    .. code-block:: javascript

        function(a, b) { a.splice(0, b) }

    The JavaScript version removes the first ``b`` items in place; this
    version returns what is left as a new list and leaves ``arr`` untouched.

    :param list arr:
        The source list.
    :param int b:
        Number of leading items to drop.
    :returns:
        ``arr`` without its first ``b`` items.

    **Example**:

    >>> splice([1, 2, 3, 4], 2)
    [3, 4]
    """
    return arr[b:]
|
462 |
+
|
463 |
+
|
464 |
+
def swap(arr: List, b: int):
    """Swap positions at b modulus the list length.

    This function is equivalent to:

    .. code-block:: javascript

        function(a, b) { var c=a[0];a[0]=a[b%a.length];a[b]=c }

    :param list arr:
        The source list; it is not modified.
    :param int b:
        Position to swap with index 0, taken modulo ``len(arr)``.
    :returns:
        A new list with elements ``0`` and ``b % len(arr)`` exchanged.

    **Example**:

    >>> swap([1, 2, 3, 4], 2)
    [3, 2, 1, 4]
    """
    r = b % len(arr)
    # Swap on a copy. Stitching slices together duplicates the first element
    # when r == 0, because both single-item pieces then refer to index 0.
    swapped = list(arr)
    swapped[0], swapped[r] = swapped[r], swapped[0]
    return swapped
|
480 |
+
|
481 |
+
|
482 |
+
def throttling_reverse(arr: list):
    """Reverse ``arr`` in place.

    The caller's list object itself is changed (nothing is returned), so
    every other reference to the same list sees the reversed order.

    :param list arr:
        The list to reverse.
    """
    arr.reverse()
|
492 |
+
|
493 |
+
|
494 |
+
def throttling_push(d: list, e: Any):
    """Add ``e`` to the end of ``d`` in place.

    :param list d:
        The list to extend.
    :param e:
        The element to add; stored as-is.
    """
    d.insert(len(d), e)
|
497 |
+
|
498 |
+
|
499 |
+
def throttling_mod_func(d: list, e: int):
    """Normalise an index the way the throttling JavaScript does.

    The JavaScript expression is::

        e = (e % d.length + d.length) % d.length

    which is reproduced term for term here.

    :param list d:
        The list whose length is the modulus.
    :param int e:
        The raw index; may be negative or larger than the list.
    :returns:
        The index folded into ``range(len(d))``.
    """
    size = len(d)
    return (e % size + size) % size
|
508 |
+
|
509 |
+
|
510 |
+
def throttling_unshift(d: list, e: int):
    """Rotate the elements of ``d`` to the right, in place.

    In the javascript, the operation is as follows:
    for(e=(e%d.length+d.length)%d.length;e--;)d.unshift(d.pop())

    :param list d:
        The list to rotate.
    :param int e:
        Number of positions to rotate by; normalised modulo ``len(d)``.
    """
    size = len(d)
    steps = (e % size + size) % size
    # Mirror the JavaScript loop: move the last item to the front each step.
    for _ in range(steps):
        d.insert(0, d.pop())
|
521 |
+
|
522 |
+
|
523 |
+
def throttling_cipher_function(d: list, e: str):
    """Cipher the characters of ``d`` in place, keyed by ``e``.

    In the javascript, the operation is as follows:
    var h = [A-Za-z0-9-_], f = 96;  // simplified from switch-case loop
    d.forEach(
        function(l,m,n){
            this.push(
                n[m]=h[
                    (h.indexOf(l)-h.indexOf(this[m])+m-32+f--)%h.length
                ]
            )
        },
        e.split("")
    )

    :param list d:
        Characters to cipher; each position is overwritten with its result.
    :param str e:
        The key. Its characters seed a working list that also receives every
        produced character, so positions beyond ``len(e)`` are keyed by
        earlier output.
    """
    alphabet = list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_')
    # Plays the role of ``this`` in the JavaScript callback.
    key = list(e)

    # Walk a snapshot so writes to ``d`` cannot disturb the iteration.
    for position, char in enumerate(d.copy()):
        # ``f`` starts at 96 and drops by one per element, i.e. 96 - position.
        countdown = 96 - position
        slot = (
            alphabet.index(char)
            - alphabet.index(key[position])
            + position
            - 32
            + countdown
        ) % len(alphabet)
        produced = alphabet[slot]
        key.append(produced)
        d[position] = produced
|
555 |
+
|
556 |
+
|
557 |
+
def throttling_nested_splice(d: list, e: int):
    """Nested splice function in throttling js.

    In the javascript, the operation is as follows:
    function(d,e){
        e=(e%d.length+d.length)%d.length;
        d.splice(
            0,
            1,
            d.splice(
                e,
                1,
                d[0]
            )[0]
        )
    }

    While testing, all this seemed to do is swap element 0 and e,
    but the two splice calls are kept as they are in case there is an
    edge case that was not considered.

    :param list d:
        The list to modify in place.
    :param int e:
        Index of the element to exchange with index 0; normalised first.
    """
    index = throttling_mod_func(d, e)
    # Inner splice: put the current head at ``index``, collect what was there.
    displaced = js_splice(d, index, 1, d[0])
    # Outer splice: put the collected element at the head.
    js_splice(d, 0, 1, displaced[0])
|
591 |
+
|
592 |
+
|
593 |
+
def throttling_prepend(d: list, e: int):
    """Move the last ``e`` elements of ``d`` to the front, in place.

    In the javascript, the operation is as follows:
    function(d,e){
        e=(e%d.length+d.length)%d.length;
        d.splice(-e).reverse().forEach(
            function(f){
                d.unshift(f)
            }
        )
    }

    :param list d:
        The list to modify.
    :param int e:
        How many trailing elements to move; normalised modulo ``len(d)``.
    """
    size_before = len(d)
    # First, calculate e
    count = (e % size_before + size_before) % size_before

    # Then write the rearranged contents back into the same list object
    d[:] = d[-count:] + d[:-count]

    size_after = len(d)
    assert size_before == size_after
|
622 |
+
|
623 |
+
|
624 |
+
def throttling_swap(d: list, e: int):
    """Exchange the first element of ``d`` with the one at ``e``, in place.

    :param list d:
        The list to modify.
    :param int e:
        Index of the other element; normalised as
        ``(e % len(d) + len(d)) % len(d)`` before use.
    """
    size = len(d)
    other = (e % size + size) % size
    d[0], d[other] = d[other], d[0]
|
630 |
+
|
631 |
+
|
632 |
+
def js_splice(arr: list, start: int, delete_count=None, *items):
    """Implementation of javascript's splice function.

    :param list arr:
        Array to splice; modified in place.
    :param int start:
        Index at which to start changing the array. Values past the end are
        clamped to ``len(arr)``; negative values count back from the end
        (clamped to 0); values that cannot be compared with an ``int`` are
        treated as 0.
    :param int delete_count:
        Number of elements to delete from the array. ``None`` (omitted)
        deletes everything from ``start`` onward; zero or a negative number
        deletes nothing.
    :param *items:
        Items to add to the array at ``start``.
    :returns:
        A list of the removed elements.

    Reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice  # noqa:E501
    """
    length = len(arr)

    # Special conditions for start value
    try:
        if start > length:
            start = length
        elif start < 0:
            # Count backwards from the end, never going before index 0.
            start = max(length + start, 0)
    except TypeError:
        # Non-integer start values are treated as 0 in js
        start = 0

    remaining = length - start
    if delete_count is None or delete_count >= remaining:
        # Omitted, or larger than what is left: delete through the end.
        delete_count = remaining
    elif delete_count < 0:
        delete_count = 0

    deleted_elements = arr[start:start + delete_count]

    # Replace the deleted span with the new items inside the same list object.
    arr[start:start + delete_count] = items

    return deleted_elements
|
672 |
+
|
673 |
+
|
674 |
+
def map_functions(js_func: str) -> Callable:
    """Return the Python equivalent of a JavaScript transform function.

    The JavaScript source is tested against a fixed list of patterns in
    order, and the callable paired with the first matching pattern wins.

    :param str js_func:
        The JavaScript version of the transform function.
    :raises RegexMatchError:
        If none of the known patterns matches ``js_func``.
    """
    # function(a){a.reverse()}
    reverse_pattern = r"{\w\.reverse\(\)}"
    # function(a,b){a.splice(0,b)}
    splice_pattern = r"{\w\.splice\(0,\w\)}"
    # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}
    swap_pattern = r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\]=\w}"
    # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c}
    swap_mod_pattern = (
        r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\%\w.length\]=\w}"
    )

    candidates = (
        (reverse_pattern, reverse),
        (splice_pattern, splice),
        (swap_pattern, swap),
        (swap_mod_pattern, swap),
    )

    selected = next(
        (fn for pattern, fn in candidates if re.search(pattern, js_func)),
        None,
    )
    if selected is None:
        raise RegexMatchError(caller="map_functions", pattern="multiple")
    return selected
|
pytube/cli.py
ADDED
@@ -0,0 +1,560 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""A simple command line application to download youtube videos."""
|
3 |
+
import argparse
|
4 |
+
import gzip
|
5 |
+
import json
|
6 |
+
import logging
|
7 |
+
import os
|
8 |
+
import shutil
|
9 |
+
import sys
|
10 |
+
import datetime as dt
|
11 |
+
import subprocess # nosec
|
12 |
+
from typing import List, Optional
|
13 |
+
|
14 |
+
import pytube.exceptions as exceptions
|
15 |
+
from pytube import __version__
|
16 |
+
from pytube import CaptionQuery, Playlist, Stream, YouTube
|
17 |
+
from pytube.helpers import safe_filename, setup_logger
|
18 |
+
|
19 |
+
|
20 |
+
logger = logging.getLogger(__name__)
|
21 |
+
|
22 |
+
|
23 |
+
def main():
    """Command line application to download youtube videos."""
    # NOTE: the docstring above is also the argparse description, so its
    # text is part of the program's output.
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(description=main.__doc__)
    args = _parse_args(parser)

    if args.verbose:
        # An empty/absent --logfile means "no log file".
        log_filename = args.logfile or None
        setup_logger(logging.DEBUG, log_filename=log_filename)
        logger.debug(f'Pytube version: {__version__}')

    # Anything that does not look like a YouTube URL gets the usage text.
    if not (args.url and "youtu" in args.url):
        parser.print_help()
        sys.exit(1)

    if "/playlist" not in args.url:
        print("Loading video...")
        youtube = YouTube(args.url)
        _perform_args_on_youtube(youtube, args)
        return

    print("Loading playlist...")
    playlist = Playlist(args.url)
    if not args.target:
        # Default the output directory to the playlist's title.
        args.target = safe_filename(playlist.title)
    for youtube_video in playlist.videos:
        try:
            _perform_args_on_youtube(youtube_video, args)
        except exceptions.PytubeError as e:
            # Report the failing video and carry on with the rest.
            print(f"There was an error with video: {youtube_video}")
            print(e)
|
54 |
+
|
55 |
+
|
56 |
+
def _perform_args_on_youtube(
    youtube: YouTube, args: argparse.Namespace
) -> None:
    """Run every action requested on the command line against one video.

    The checks are independent: each option that is set triggers its own
    action, in the fixed order below.

    :param YouTube youtube:
        The video to act on.
    :param argparse.Namespace args:
        Parsed command line options.
    """
    # Exactly two argv entries means only the URL was given.
    only_url_given = len(sys.argv) == 2
    if only_url_given:  # no arguments parsed
        download_highest_resolution_progressive(
            youtube=youtube, resolution="highest", target=args.target
        )

    if args.list_captions:
        _print_available_captions(youtube.captions)

    if args.list:
        display_streams(youtube)

    if args.build_playback_report:
        build_playback_report(youtube)

    if args.itag:
        download_by_itag(youtube=youtube, itag=args.itag, target=args.target)

    if args.caption_code:
        download_caption(
            youtube=youtube, lang_code=args.caption_code, target=args.target
        )

    if args.resolution:
        download_by_resolution(
            youtube=youtube, resolution=args.resolution, target=args.target
        )

    if args.audio:
        download_audio(
            youtube=youtube, filetype=args.audio, target=args.target
        )

    if args.ffmpeg:
        ffmpeg_process(
            youtube=youtube, resolution=args.ffmpeg, target=args.target
        )
|
87 |
+
|
88 |
+
|
89 |
+
def _parse_args(
    parser: argparse.ArgumentParser, args: Optional[List] = None
) -> argparse.Namespace:
    """Register the CLI options on ``parser`` and parse the arguments.

    :param argparse.ArgumentParser parser:
        The parser to add the options to.
    :param list args:
        Argument strings to parse; ``None`` is passed straight through to
        ``parser.parse_args``.
    :returns:
        The parsed options.
    """
    parser.add_argument(
        "url", help="The YouTube /watch or /playlist url", nargs="?"
    )
    parser.add_argument(
        "--version", action="version", version="%(prog)s " + __version__,
    )
    parser.add_argument(
        "--itag", type=int, help="The itag for the desired stream",
    )
    parser.add_argument(
        "-r",
        "--resolution",
        type=str,
        help="The resolution for the desired stream",
    )
    parser.add_argument(
        "-l",
        "--list",
        action="store_true",
        help=(
            "The list option causes pytube cli to return a list of streams "
            "available to download"
        ),
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        dest="verbose",
        help="Set logger output to verbose output.",
    )
    parser.add_argument(
        "--logfile",
        action="store",
        help="logging debug and error messages into a log file",
    )
    parser.add_argument(
        "--build-playback-report",
        action="store_true",
        help="Save the html and js to disk",
    )
    parser.add_argument(
        "-c",
        "--caption-code",
        type=str,
        help=(
            "Download srt captions for given language code. "
            "Prints available language codes if no argument given"
        ),
    )
    parser.add_argument(
        '-lc',
        '--list-captions',
        action='store_true',
        help=(
            "List available caption codes for a video"
        )
    )
    parser.add_argument(
        "-t",
        "--target",
        help=(
            "The output directory for the downloaded stream. "
            "Default is current working directory"
        ),
    )
    # Adjacent string literals are concatenated verbatim, so each sentence
    # needs its own trailing ". " to keep the rendered help readable.
    parser.add_argument(
        "-a",
        "--audio",
        const="mp4",
        nargs="?",
        help=(
            "Download the audio for a given URL at the highest bitrate available. "
            "Defaults to mp4 format if none is specified"
        ),
    )
    parser.add_argument(
        "-f",
        "--ffmpeg",
        const="best",
        nargs="?",
        help=(
            "Downloads the audio and video stream for resolution provided. "
            "If no resolution is provided, downloads the best resolution. "
            "Runs the command line program ffmpeg to combine the audio and video"
        ),
    )

    return parser.parse_args(args)
|
181 |
+
|
182 |
+
|
183 |
+
def build_playback_report(youtube: YouTube) -> None:
    """Serialize the request data to json for offline debugging.

    Writes a gzip-compressed JSON file named
    ``yt-video-<video_id>-<unix timestamp>.json.gz`` into the current
    working directory, holding the watch url, the js, the watch html and
    the video info.

    :param YouTube youtube:
        A YouTube object.
    """
    # Use an aware UTC datetime: calling .timestamp() on the naive value
    # from utcnow() treats it as local time and skews the epoch by the
    # machine's UTC offset.
    ts = int(dt.datetime.now(dt.timezone.utc).timestamp())
    fp = os.path.join(os.getcwd(), f"yt-video-{youtube.video_id}-{ts}.json.gz")

    js = youtube.js
    watch_html = youtube.watch_html
    vid_info = youtube.vid_info

    with gzip.open(fp, "wb") as fh:
        fh.write(
            json.dumps(
                {
                    "url": youtube.watch_url,
                    "js": js,
                    "watch_html": watch_html,
                    "video_info": vid_info,
                }
            ).encode("utf8"),
        )
|
207 |
+
|
208 |
+
|
209 |
+
def display_progress_bar(
    bytes_received: int, filesize: int, ch: str = "█", scale: float = 0.55
) -> None:
    """Display a simple, pretty progress bar.

    Example:
    ~~~~~~~~
    PSY - GANGNAM STYLE(강남스타일) MV.mp4
    ↳ |███████████████████████████████████████| 100.0%

    The line ends with a carriage return rather than a newline, so the next
    call overwrites it.

    :param int bytes_received:
        Number of bytes received so far.
    :param int filesize:
        File size of the media stream in bytes. A size of zero is shown as
        complete.
    :param str ch:
        Character to use for presenting progress segment.
    :param float scale:
        Scale multiplier to reduce progress bar size.

    """
    columns = shutil.get_terminal_size().columns
    max_width = int(columns * scale)

    if filesize:
        filled = int(round(max_width * bytes_received / float(filesize)))
        percent = round(100.0 * bytes_received / float(filesize), 1)
    else:
        # Nothing to transfer: show a full bar instead of dividing by zero.
        filled = max_width
        percent = 100.0

    remaining = max_width - filled
    progress_bar = ch * filled + " " * remaining
    text = f" ↳ |{progress_bar}| {percent}%\r"
    sys.stdout.write(text)
    sys.stdout.flush()
|
240 |
+
|
241 |
+
|
242 |
+
# noinspection PyUnusedLocal
|
243 |
+
def on_progress(
    stream: Stream, chunk: bytes, bytes_remaining: int
) -> None:  # pylint: disable=W0613
    """Redraw the progress bar for a download in progress.

    :param Stream stream:
        The stream being downloaded; only its ``filesize`` is read.
    :param bytes chunk:
        Unused.
    :param int bytes_remaining:
        Number of bytes still to be received.
    """
    total = stream.filesize
    display_progress_bar(total - bytes_remaining, total)
|
249 |
+
|
250 |
+
|
251 |
+
def _download(
    stream: Stream,
    target: Optional[str] = None,
    filename: Optional[str] = None,
) -> None:
    """Download ``stream`` unless it is already present at its target path.

    Prints a one-line header with the file name and its size in whole
    megabytes before doing anything else.

    :param Stream stream:
        The stream to download.
    :param str target:
        Output directory, forwarded as ``output_path``.
    :param str filename:
        File name override; the stream's default file name is shown in the
        header when this is not given.
    """
    filesize_megabytes = stream.filesize // (1024 * 1024)
    print(f"{filename or stream.default_filename} | {filesize_megabytes} MB")

    file_path = stream.get_file_path(filename=filename, output_path=target)
    if not stream.exists_at_path(file_path):
        stream.download(output_path=target, filename=filename)
        # The progress bar ends in "\r"; move to a fresh line afterwards.
        sys.stdout.write("\n")
    else:
        print(f"Already downloaded at:\n{file_path}")
|
265 |
+
|
266 |
+
|
267 |
+
def _unique_name(base: str, subtype: str, media_type: str, target: str) -> str:
|
268 |
+
"""
|
269 |
+
Given a base name, the file format, and the target directory, will generate
|
270 |
+
a filename unique for that directory and file format.
|
271 |
+
:param str base:
|
272 |
+
The given base-name.
|
273 |
+
:param str subtype:
|
274 |
+
The filetype of the video which will be downloaded.
|
275 |
+
:param str media_type:
|
276 |
+
The media_type of the file, ie. "audio" or "video"
|
277 |
+
:param Path target:
|
278 |
+
Target directory for download.
|
279 |
+
"""
|
280 |
+
counter = 0
|
281 |
+
while True:
|
282 |
+
file_name = f"{base}_{media_type}_{counter}"
|
283 |
+
file_path = os.path.join(target, f"{file_name}.{subtype}")
|
284 |
+
if not os.path.exists(file_path):
|
285 |
+
return file_name
|
286 |
+
counter += 1
|
287 |
+
|
288 |
+
|
289 |
+
def ffmpeg_process(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """
    Decides the correct video stream to download, then calls _ffmpeg_downloader.

    For ``"best"`` the highest-resolution non-progressive stream is used,
    preferring mp4 when an mp4 stream exists at that same resolution. For
    any other value an mp4 stream at that resolution is tried first, then
    any stream at that resolution. Exits the program when no suitable video
    or audio stream is found.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        YouTube video resolution.
    :param str target:
        Target directory for download
    """
    youtube.register_on_progress_callback(on_progress)
    target = target or os.getcwd()

    if resolution == "best":
        highest_quality_stream = (
            youtube.streams.filter(progressive=False)
            .order_by("resolution")
            .last()
        )
        mp4_stream = (
            youtube.streams.filter(progressive=False, subtype="mp4")
            .order_by("resolution")
            .last()
        )
        # Either query can come back empty (None), so only compare
        # resolutions when both streams exist.
        if (
            highest_quality_stream is not None
            and mp4_stream is not None
            and highest_quality_stream.resolution == mp4_stream.resolution
        ):
            video_stream = mp4_stream
        else:
            video_stream = highest_quality_stream
    else:
        video_stream = youtube.streams.filter(
            progressive=False, resolution=resolution, subtype="mp4"
        ).first()
        if not video_stream:
            video_stream = youtube.streams.filter(
                progressive=False, resolution=resolution
            ).first()

    if video_stream is None:
        print(f"Could not find a stream with resolution: {resolution}")
        print("Try one of these:")
        display_streams(youtube)
        sys.exit()

    # Prefer audio in the same container as the video, else the highest
    # bitrate audio-only stream of any type.
    audio_stream = youtube.streams.get_audio_only(video_stream.subtype)
    if not audio_stream:
        audio_stream = (
            youtube.streams.filter(only_audio=True).order_by("abr").last()
        )
    if not audio_stream:
        print("Could not find an audio only stream")
        sys.exit()
    _ffmpeg_downloader(
        audio_stream=audio_stream, video_stream=video_stream, target=target
    )
|
345 |
+
|
346 |
+
|
347 |
+
def _ffmpeg_downloader(
    audio_stream: Stream, video_stream: Stream, target: str
) -> None:
    """
    Downloads the given video and audio streams under unique names, then uses
    ffmpeg to create a new file with the audio and video from the downloaded
    files. The two downloaded files are deleted only when ffmpeg exits
    successfully; otherwise they are left in place and a message is printed.

    :param Stream audio_stream:
        A valid Stream object representing the audio to download
    :param Stream video_stream:
        A valid Stream object representing the video to download
    :param Path target:
        Directory in which all three files are written
    """
    video_unique_name = _unique_name(
        safe_filename(video_stream.title),
        video_stream.subtype,
        "video",
        target=target,
    )
    audio_unique_name = _unique_name(
        safe_filename(video_stream.title),
        audio_stream.subtype,
        "audio",
        target=target,
    )
    _download(stream=video_stream, target=target, filename=video_unique_name)
    print("Loading audio...")
    _download(stream=audio_stream, target=target, filename=audio_unique_name)

    video_path = os.path.join(
        target, f"{video_unique_name}.{video_stream.subtype}"
    )
    audio_path = os.path.join(
        target, f"{audio_unique_name}.{audio_stream.subtype}"
    )
    final_path = os.path.join(
        target, f"{safe_filename(video_stream.title)}.{video_stream.subtype}"
    )

    result = subprocess.run(  # nosec
        [
            "ffmpeg",
            "-i",
            video_path,
            "-i",
            audio_path,
            "-codec",
            "copy",
            final_path,
        ]
    )
    if result.returncode != 0:
        # Merging failed: the separate parts are the only copy of the data,
        # so keep them rather than deleting what was just downloaded.
        print(
            f"ffmpeg exited with status {result.returncode}; "
            f"keeping {video_path} and {audio_path}"
        )
        return

    os.unlink(video_path)
    os.unlink(audio_path)
|
403 |
+
|
404 |
+
|
405 |
+
def download_by_itag(
    youtube: YouTube, itag: int, target: Optional[str] = None
) -> None:
    """Download the stream identified by ``itag``.

    When no stream has that itag, the available streams are listed and the
    program exits. A keyboard interrupt during the download also exits.

    :param YouTube youtube:
        A valid YouTube object.
    :param int itag:
        YouTube format identifier code.
    :param str target:
        Target directory for download
    """
    stream = youtube.streams.get_by_itag(itag)

    if stream is not None:
        youtube.register_on_progress_callback(on_progress)
        try:
            _download(stream, target=target)
        except KeyboardInterrupt:
            sys.exit()
        return

    print(f"Could not find a stream with itag: {itag}")
    print("Try one of these:")
    display_streams(youtube)
    sys.exit()
|
430 |
+
|
431 |
+
|
432 |
+
def download_by_resolution(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """Download the stream selected by ``resolution``.

    When no stream is found for that resolution, the available streams are
    listed and the program exits. A keyboard interrupt during the download
    also exits.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        YouTube video resolution.
    :param str target:
        Target directory for download
    """
    # TODO(nficano): allow dash itags to be selected
    stream = youtube.streams.get_by_resolution(resolution)

    if stream is not None:
        youtube.register_on_progress_callback(on_progress)
        try:
            _download(stream, target=target)
        except KeyboardInterrupt:
            sys.exit()
        return

    print(f"Could not find a stream with resolution: {resolution}")
    print("Try one of these:")
    display_streams(youtube)
    sys.exit()
|
458 |
+
|
459 |
+
|
460 |
+
def download_highest_resolution_progressive(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """Download the stream returned by ``streams.get_highest_resolution()``.

    If the video is unavailable a message is printed and nothing is
    downloaded. A keyboard interrupt during the download exits the program.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        Not used by this function.
    :param str target:
        Target directory for download
    """
    youtube.register_on_progress_callback(on_progress)

    try:
        stream = youtube.streams.get_highest_resolution()
    except exceptions.VideoUnavailable as err:
        print(f"No video streams available: {err}")
        return

    try:
        _download(stream, target=target)
    except KeyboardInterrupt:
        sys.exit()
|
482 |
+
|
483 |
+
|
484 |
+
def display_streams(youtube: YouTube) -> None:
    """Print every stream of a video, one per line.

    :param YouTube youtube:
        The YouTube object whose ``streams`` are listed.

    """
    for available_stream in youtube.streams:
        print(available_stream)
|
493 |
+
|
494 |
+
|
495 |
+
def _print_available_captions(captions: CaptionQuery) -> None:
    """Print the ``code`` of every caption in ``captions`` on one line.

    :param CaptionQuery captions:
        The captions to list; the codes are joined with ``", "``.
    """
    print(
        f"Available caption codes are: {', '.join(c.code for c in captions)}"
    )
|
499 |
+
|
500 |
+
|
501 |
+
def download_caption(
    youtube: YouTube, lang_code: Optional[str], target: Optional[str] = None
) -> None:
    """Download a caption for the YouTube video.

    :param YouTube youtube:
        A valid YouTube object.
    :param str lang_code:
        Language code desired for caption file.
        When looking the code up raises ``KeyError``, the available codes
        are printed instead of downloading anything.
    :param str target:
        Target directory for download
    """
    # Only the lookup is guarded: a KeyError raised later, while downloading
    # or converting the caption, must not be reported as "code not found".
    try:
        caption = youtube.captions[lang_code]
    except KeyError:
        print(f"Unable to find caption with code: {lang_code}")
        _print_available_captions(youtube.captions)
    else:
        downloaded_path = caption.download(
            title=youtube.title, output_path=target
        )
        print(f"Saved caption file to: {downloaded_path}")
|
524 |
+
|
525 |
+
|
526 |
+
def download_audio(
    youtube: YouTube, filetype: str, target: Optional[str] = None
) -> None:
    """Download an audio-only stream of the requested file type.

    The stream chosen is the last one after filtering ``youtube.streams``
    by ``only_audio=True`` and ``subtype=filetype`` and ordering by
    ``"abr"``.

    :param YouTube youtube:
        A valid YouTube object.
    :param str filetype:
        Desired file format to download.
    :param str target:
        Target directory for download
    """
    candidates = youtube.streams.filter(only_audio=True, subtype=filetype)
    chosen = candidates.order_by("abr").last()

    if chosen is None:
        # No match: list what is available, then stop the program.
        print("No audio only stream found. Try one of these:")
        display_streams(youtube)
        sys.exit()

    youtube.register_on_progress_callback(on_progress)

    try:
        _download(chosen, target=target)
    except KeyboardInterrupt:
        # Ctrl-C during the transfer ends the program via SystemExit.
        sys.exit()
|
557 |
+
|
558 |
+
|
559 |
+
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
pytube/contrib/__init__.py
ADDED
File without changes
|
pytube/contrib/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (150 Bytes). View file
|
|
pytube/contrib/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (150 Bytes). View file
|
|