wldmr committed on
Commit
837fdb6
1 Parent(s): 84ea2c9
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +44 -48
  2. frames.py +102 -0
  3. lexrank.py +24 -0
  4. myrpunct/__init__.py +2 -0
  5. myrpunct/__pycache__/__init__.cpython-310.pyc +0 -0
  6. myrpunct/__pycache__/__init__.cpython-39.pyc +0 -0
  7. myrpunct/__pycache__/punctuate.cpython-310.pyc +0 -0
  8. myrpunct/__pycache__/punctuate.cpython-39.pyc +0 -0
  9. myrpunct/punctuate.py +174 -0
  10. myrpunct/utils.py +34 -0
  11. pytube/__init__.py +19 -0
  12. pytube/__main__.py +467 -0
  13. pytube/__pycache__/__init__.cpython-310.pyc +0 -0
  14. pytube/__pycache__/__init__.cpython-39.pyc +0 -0
  15. pytube/__pycache__/__main__.cpython-310.pyc +0 -0
  16. pytube/__pycache__/__main__.cpython-39.pyc +0 -0
  17. pytube/__pycache__/captions.cpython-310.pyc +0 -0
  18. pytube/__pycache__/captions.cpython-39.pyc +0 -0
  19. pytube/__pycache__/cipher.cpython-310.pyc +0 -0
  20. pytube/__pycache__/cipher.cpython-39.pyc +0 -0
  21. pytube/__pycache__/exceptions.cpython-310.pyc +0 -0
  22. pytube/__pycache__/exceptions.cpython-39.pyc +0 -0
  23. pytube/__pycache__/extract.cpython-310.pyc +0 -0
  24. pytube/__pycache__/extract.cpython-39.pyc +0 -0
  25. pytube/__pycache__/helpers.cpython-310.pyc +0 -0
  26. pytube/__pycache__/helpers.cpython-39.pyc +0 -0
  27. pytube/__pycache__/innertube.cpython-310.pyc +0 -0
  28. pytube/__pycache__/innertube.cpython-39.pyc +0 -0
  29. pytube/__pycache__/itags.cpython-310.pyc +0 -0
  30. pytube/__pycache__/itags.cpython-39.pyc +0 -0
  31. pytube/__pycache__/metadata.cpython-310.pyc +0 -0
  32. pytube/__pycache__/metadata.cpython-39.pyc +0 -0
  33. pytube/__pycache__/monostate.cpython-310.pyc +0 -0
  34. pytube/__pycache__/monostate.cpython-39.pyc +0 -0
  35. pytube/__pycache__/parser.cpython-310.pyc +0 -0
  36. pytube/__pycache__/parser.cpython-39.pyc +0 -0
  37. pytube/__pycache__/query.cpython-310.pyc +0 -0
  38. pytube/__pycache__/query.cpython-39.pyc +0 -0
  39. pytube/__pycache__/request.cpython-310.pyc +0 -0
  40. pytube/__pycache__/request.cpython-39.pyc +0 -0
  41. pytube/__pycache__/streams.cpython-310.pyc +0 -0
  42. pytube/__pycache__/streams.cpython-39.pyc +0 -0
  43. pytube/__pycache__/version.cpython-310.pyc +0 -0
  44. pytube/__pycache__/version.cpython-39.pyc +0 -0
  45. pytube/captions.py +154 -0
  46. pytube/cipher.py +697 -0
  47. pytube/cli.py +560 -0
  48. pytube/contrib/__init__.py +0 -0
  49. pytube/contrib/__pycache__/__init__.cpython-310.pyc +0 -0
  50. pytube/contrib/__pycache__/__init__.cpython-39.pyc +0 -0
app.py CHANGED
@@ -1,55 +1,51 @@
1
- # main.py
2
-
3
- from fastapi import FastAPI
4
  from PIL import Image
5
- import base64
6
- from fastapi.responses import HTMLResponse, FileResponse
7
-
8
- app = FastAPI()
9
-
10
-
11
- @app.get("/")
12
- async def root():
13
- return FileResponse(path="static/index.html", media_type="text/html")
14
-
15
- @app.get("/html")
16
- async def root():
17
- """Basic HTML response."""
18
- body = (
19
- "<html>"
20
- "<body style='padding: 10px;'>"
21
- "<h1>Welcome to the API</h1>"
22
- "<div>"
23
- "Check the docs: <a href='/docs'>here</a>"
24
- "</div>"
25
- "</body>"
26
- "</html>"
27
- )
28
-
29
- return HTMLResponse(content=body)
30
-
31
- @app.get("/api")
32
- async def cal_api():
33
  images = []
 
 
 
 
 
 
34
 
35
- with open('workdir/lion.jpg', 'rb') as open_file:
36
- byte_content = open_file.read()
37
- base64_bytes = base64.b64encode(byte_content)
38
- base64_string = base64_bytes.decode('utf-8')
39
- images.append(base64_string)
 
 
 
 
 
40
 
41
- with open('workdir/cheetah.jpg', 'rb') as open_file:
42
- byte_content = open_file.read()
43
- base64_bytes = base64.b64encode(byte_content)
44
- base64_string = base64_bytes.decode('utf-8')
45
- images.append(base64_string)
46
 
47
- #image_path='lion.jpg'
48
- #pilim = Image.open(image_path)
49
- #pilimrot = pilim.rotate(45)
50
- return {"data": images}
51
 
52
- @app.get("/items/{item_id}")
53
- async def read_item(item_id):
54
- return {"item_id": item_id}
55
 
 
 
 
1
+ import gradio as gr
 
 
2
  from PIL import Image
3
+ import os
4
+ import summarizer as su
5
+ import nltk
6
+
7
+
8
+ def image_mod(rpunkt_switch, link):
9
+
10
+ if len(link)==0:
11
+ return 'Error: No link provided', None
12
+
13
+ nltk_file = 'nltk_data/tokenizers/punkt.zip'
14
+ home_pc = '/Users/hujo/'
15
+ home_hf = '/home/user/'
16
+ if os.path.exists(home_pc+nltk_file) or os.path.exists(home_hf+nltk_file):
17
+ print('nltk punkt file exists in ', nltk_file)
18
+ else:
19
+ nltk.download('punkt')
20
+
21
+ #link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE'
22
+ lexrank_switch = True
23
+ html = ''
 
 
 
 
 
 
 
24
  images = []
25
+ html, images = su.getSummary(link, lexrank_switch, rpunkt_switch)
26
+ #images = su.getSummaryImage(link, lexrank_switch, rpunkt_switch)
27
+ print(html)
28
+
29
+ files = os.listdir('workdir/')
30
+ print('local files: ',files)
31
 
32
+ #image_path = 'workdir/lion.jpg'
33
+ #im = Image.open(image_path)
34
+ #images.append(im)
35
+ #with Image.open(open(image_path,'rb')) as im:
36
+ # images.append(im)
37
+ #images.append(im.rotate(90))
38
+
39
+ #images[0].save("newlion.png")
40
+
41
+ print('images',images)
42
 
43
+ return html, images
 
 
 
 
44
 
 
 
 
 
45
 
46
+ demo = gr.Interface(image_mod,
47
+ [gr.Checkbox(label='Restore runctuation'), "text"] , ["html", gr.Gallery()],
48
+ allow_flagging="never")
49
 
50
+ if __name__ == "__main__":
51
+ demo.launch()
frames.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ast import Try
2
+ import subprocess as sp
3
+ import os
4
+
5
+ # show current venv: echo $VIRTUAL_ENV
6
+ # import sys
7
+ # del sys.modules['frames']
8
+
9
+ # transcript module
10
+ # 1. extract timestamps from transcript
11
+ # 2. extract captions from transcript
12
+ # this module
13
+ # 3. extract frames at timestamps
14
+ # 4. add caption to each frame
15
+ # 5. convert images to mp4 video
16
+
17
+ # converts a list of images to an mp4 video
18
+ def convertImageToVideo():
19
+ cmd = "ffmpeg -y -f image2 -i frame_%04d.jpg output_video.mp4"
20
+ cmd_call = cmd.split()
21
+ working_dir = './workdir'
22
+
23
+ with sp.Popen(cmd_call,cwd=working_dir, stderr=sp.PIPE) as proc:
24
+ result = proc.stderr.read()
25
+
26
+ return [proc.wait(),result]
27
+
28
+
29
+ # extract a frame as jpg image file
30
+ # from a video at a given timestamp
31
+ # num=0; for p in $(cat timestamps); do ((num++)); printf "$num $p\r"; dnum=$(printf "%03d" "$num"); ffmpeg -ss $p -i "$mp4file" -frames:v 1 out_$dnum.jpg >& ffmpeg.out; done
32
+ def extractImagesFromVideo(timestamps):
33
+ working_dir = './workdir'
34
+ input_file = 'input_video.mp4'
35
+ if not os.path.isfile(working_dir+'/'+input_file):
36
+ return 'Error: File '+input_file+' is missing, create the file first.'
37
+
38
+
39
+ # create a working directory for the files
40
+ if not os.path.isdir(working_dir):
41
+ print('There is no working directory. Create a new one.')
42
+ os.mkdir(working_dir)
43
+
44
+ proc_list = []
45
+ for current_frame, current_timestamp in enumerate(timestamps, start=1):
46
+ print(f"{current_frame:04d}", current_timestamp)
47
+ cmd = 'ffmpeg -y -ss '+str(current_timestamp)+' -i '+input_file+' -frames:v 1 frame_'+f"{current_frame:04d}"+'.jpg'
48
+ cmd_call = cmd.split()
49
+
50
+ with sp.Popen(cmd_call,cwd=working_dir, stderr=sp.PIPE) as proc:
51
+ proc_list.append(proc.wait())
52
+
53
+ return proc_list
54
+
55
+ # add caption to each image
56
+ # 'convert' program is from the 'imagemagick' package
57
+ # num=0; while read p; do ((num++)); dnum=$(printf "%03d" "$num"); printf "$dnum $p\r"; convert out_$dnum.jpg -undercolor Black -fill white -gravity South -pointsize 25 -annotate +0+10 "$p" out_$dnum.jpg >& ffmpeg.out; done<srt.txt
58
+ def addCaptionToImage(caption):
59
+ proc_list = []
60
+ for current_frame, current_caption in enumerate(caption.split('\n'), start=1):
61
+ print(f"{current_frame:04d}", current_caption)
62
+ #current_frame=182
63
+ #current_caption='with this method as compared to just'
64
+
65
+ cmd = 'convert frame_'+f"{current_frame:04d}"+'.jpg -undercolor Black -fill white -gravity South -pointsize 25 -annotate +0+10'
66
+ cmd_call = cmd.split()
67
+ # the 'split' command would also split the input caption
68
+ # therefore it has to be added to the array after the split
69
+ cmd_call.append(current_caption)
70
+ cmd_call.append('frame_'+f"{current_frame:04d}"+'.jpg')
71
+ #cmd_call
72
+ working_dir = './workdir'
73
+
74
+ with sp.Popen(cmd_call,cwd=working_dir, stderr=sp.PIPE) as proc:
75
+ proc_list.append(proc.wait())
76
+
77
+ return proc_list
78
+
79
+
80
+ def removeFilesInWorkdir():
81
+ result =''
82
+ working_dir = './workdir'
83
+ try:
84
+ for f in os.listdir(working_dir):
85
+ os.remove(os.path.join(working_dir, f))
86
+ except:
87
+ result = 'Error: Not all files could be removed.'
88
+
89
+ return result
90
+
91
+ def renameOutputVideo(filenme):
92
+ result = ''
93
+ working_dir = './workdir'
94
+ shelf_dir = './shelf'
95
+ input_filename = working_dir+'/'+'output_video.mp4'
96
+ output_filename = shelf_dir+'/'+filenme+'.mp4'
97
+ try:
98
+ os.rename(input_filename,output_filename)
99
+ except:
100
+ result = 'Error: Could not rename file.'
101
+
102
+ return result
lexrank.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import nltk
2
+ #nltk.download('punkt')
3
+
4
+ from sumy.parsers.html import HtmlParser
5
+ from sumy.parsers.plaintext import PlaintextParser
6
+ from sumy.nlp.tokenizers import Tokenizer
7
+ from sumy.summarizers.lex_rank import LexRankSummarizer
8
+ from sumy.nlp.stemmers import Stemmer
9
+ from sumy.utils import get_stop_words
10
+
11
+ def getSummary(text, nr_sentences):
12
+ summary=[]
13
+ LANGUAGE = "english"
14
+ SENTENCES_COUNT = nr_sentences
15
+ #parser = PlaintextParser.from_file("/Users/hujo/Downloads/Channel_Summaries/wholesaleted.srt.pnct.txt", Tokenizer(LANGUAGE))
16
+ parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
17
+ #print(parser.document)
18
+ stemmer = Stemmer(LANGUAGE)
19
+ summarizer = LexRankSummarizer(stemmer)
20
+ summarizer.stop_words = get_stop_words(LANGUAGE)
21
+ for sentence in summarizer(parser.document, SENTENCES_COUNT):
22
+ summary.append(sentence)
23
+
24
+ return summary
myrpunct/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .punctuate import RestorePuncts
2
+ print("init executed ...")
myrpunct/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (231 Bytes). View file
 
myrpunct/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (227 Bytes). View file
 
myrpunct/__pycache__/punctuate.cpython-310.pyc ADDED
Binary file (5.71 kB). View file
 
myrpunct/__pycache__/punctuate.cpython-39.pyc ADDED
Binary file (5.69 kB). View file
 
myrpunct/punctuate.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # 💾⚙️🔮
3
+
4
+ __author__ = "Daulet N."
5
+ __email__ = "daulet.nurmanbetov@gmail.com"
6
+
7
+ import logging
8
+ from langdetect import detect
9
+ from simpletransformers.ner import NERModel, NERArgs
10
+
11
+
12
+ class RestorePuncts:
13
+ def __init__(self, wrds_per_pred=250, use_cuda=False):
14
+ self.wrds_per_pred = wrds_per_pred
15
+ self.overlap_wrds = 30
16
+ self.valid_labels = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U']
17
+ self.model_hf = "wldmr/felflare-bert-restore-punctuation"
18
+ self.model_args = NERArgs()
19
+ self.model_args.silent = True
20
+ self.model_args.max_seq_length = 512
21
+ #self.model_args.use_multiprocessing = False
22
+ self.model = NERModel("bert", self.model_hf, labels=self.valid_labels, use_cuda=use_cuda, args=self.model_args)
23
+ #self.model = NERModel("bert", self.model_hf, labels=self.valid_labels, use_cuda=use_cuda, args={"silent": True, "max_seq_length": 512, "use_multiprocessing": False})
24
+ print("class init ...")
25
+ print("use_multiprocessing: ",self.model_args.use_multiprocessing)
26
+
27
+ def status(self):
28
+ print("function called")
29
+
30
+ def punctuate(self, text: str, lang:str=''):
31
+ """
32
+ Performs punctuation restoration on arbitrarily large text.
33
+ Detects if input is not English, if non-English was detected terminates predictions.
34
+ Override by supplying `lang='en'`
35
+
36
+ Args:
37
+ - text (str): Text to punctuate, can be few words to as large as you want.
38
+ - lang (str): Explicit language of input text.
39
+ """
40
+ if not lang and len(text) > 10:
41
+ lang = detect(text)
42
+ if lang != 'en':
43
+ raise Exception(F"""Non English text detected. Restore Punctuation works only for English.
44
+ If you are certain the input is English, pass argument lang='en' to this function.
45
+ Punctuate received: {text}""")
46
+
47
+ # split up large text into BERT-digestible chunks
48
+ splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
49
+ # predict slices
50
+ # full_preds_lst contains tuple of labels and logits
51
+ full_preds_lst = [self.predict(i['text']) for i in splits]
52
+ # extract predictions, and discard logits
53
+ preds_lst = [i[0][0] for i in full_preds_lst]
54
+ # join text slices
55
+ combined_preds = self.combine_results(text, preds_lst)
56
+ # create punctuated prediction
57
+ punct_text = self.punctuate_texts(combined_preds)
58
+ return punct_text
59
+
60
+ def predict(self, input_slice):
61
+ """
62
+ Passes the unpunctuated text to the model for punctuation.
63
+ """
64
+ predictions, raw_outputs = self.model.predict([input_slice])
65
+ return predictions, raw_outputs
66
+
67
+ @staticmethod
68
+ def split_on_toks(text, length, overlap):
69
+ """
70
+ Splits text into predefined slices of overlapping text with indexes (offsets)
71
+ that tie-back to original text.
72
+ This is done to bypass 512 token limit on transformer models by sequentially
73
+ feeding chunks of < 512 toks.
74
+ Example output:
75
+ [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
76
+ """
77
+ wrds = text.replace('\n', ' ').split(" ")
78
+ resp = []
79
+ lst_chunk_idx = 0
80
+ i = 0
81
+
82
+ while True:
83
+ # words in the chunk and the overlapping portion
84
+ wrds_len = wrds[(length * i):(length * (i + 1))]
85
+ wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
86
+ wrds_split = wrds_len + wrds_ovlp
87
+
88
+ # Break loop if no more words
89
+ if not wrds_split:
90
+ break
91
+
92
+ wrds_str = " ".join(wrds_split)
93
+ nxt_chunk_start_idx = len(" ".join(wrds_len))
94
+ lst_char_idx = len(" ".join(wrds_split))
95
+
96
+ resp_obj = {
97
+ "text": wrds_str,
98
+ "start_idx": lst_chunk_idx,
99
+ "end_idx": lst_char_idx + lst_chunk_idx,
100
+ }
101
+
102
+ resp.append(resp_obj)
103
+ lst_chunk_idx += nxt_chunk_start_idx + 1
104
+ i += 1
105
+ logging.info(f"Sliced transcript into {len(resp)} slices.")
106
+ return resp
107
+
108
+ @staticmethod
109
+ def combine_results(full_text: str, text_slices):
110
+ """
111
+ Given a full text and predictions of each slice combines predictions into a single text again.
112
+ Performs validation of whether the text was combined correctly
113
+ """
114
+ split_full_text = full_text.replace('\n', ' ').split(" ")
115
+ split_full_text = [i for i in split_full_text if i]
116
+ split_full_text_len = len(split_full_text)
117
+ output_text = []
118
+ index = 0
119
+
120
+ if len(text_slices[-1]) <= 3 and len(text_slices) > 1:
121
+ text_slices = text_slices[:-1]
122
+
123
+ for _slice in text_slices:
124
+ slice_wrds = len(_slice)
125
+ for ix, wrd in enumerate(_slice):
126
+ # print(index, "|", str(list(wrd.keys())[0]), "|", split_full_text[index])
127
+ if index == split_full_text_len:
128
+ break
129
+
130
+ if split_full_text[index] == str(list(wrd.keys())[0]) and \
131
+ ix <= slice_wrds - 3 and text_slices[-1] != _slice:
132
+ index += 1
133
+ pred_item_tuple = list(wrd.items())[0]
134
+ output_text.append(pred_item_tuple)
135
+ elif split_full_text[index] == str(list(wrd.keys())[0]) and text_slices[-1] == _slice:
136
+ index += 1
137
+ pred_item_tuple = list(wrd.items())[0]
138
+ output_text.append(pred_item_tuple)
139
+ assert [i[0] for i in output_text] == split_full_text
140
+ return output_text
141
+
142
+ @staticmethod
143
+ def punctuate_texts(full_pred: list):
144
+ """
145
+ Given a list of Predictions from the model, applies the predictions to text,
146
+ thus punctuating it.
147
+ """
148
+ punct_resp = ""
149
+ for i in full_pred:
150
+ word, label = i
151
+ if label[-1] == "U":
152
+ punct_wrd = word.capitalize()
153
+ else:
154
+ punct_wrd = word
155
+
156
+ if label[0] != "O":
157
+ punct_wrd += label[0]
158
+
159
+ punct_resp += punct_wrd + " "
160
+ punct_resp = punct_resp.strip()
161
+ # Append trailing period if one doesn't exist.
162
+ if punct_resp[-1].isalnum():
163
+ punct_resp += "."
164
+ return punct_resp
165
+
166
+
167
+ if __name__ == "__main__":
168
+ punct_model = RestorePuncts()
169
+ # read test file
170
+ with open('../tests/sample_text.txt', 'r') as fp:
171
+ test_sample = fp.read()
172
+ # predict text and print
173
+ punctuated = punct_model.punctuate(test_sample)
174
+ print(punctuated)
myrpunct/utils.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # 💾⚙️🔮
3
+
4
+ __author__ = "Daulet N."
5
+ __email__ = "daulet.nurmanbetov@gmail.com"
6
+
7
+ def prepare_unpunct_text(text):
8
+ """
9
+ Given a text, normalizes it to subsequently restore punctuation
10
+ """
11
+ formatted_txt = text.replace('\n', '').strip()
12
+ formatted_txt = formatted_txt.lower()
13
+ formatted_txt_lst = formatted_txt.split(" ")
14
+ punct_strp_txt = [strip_punct(i) for i in formatted_txt_lst]
15
+ normalized_txt = " ".join([i for i in punct_strp_txt if i])
16
+ return normalized_txt
17
+
18
+ def strip_punct(wrd):
19
+ """
20
+ Given a word, strips non-alphanumeric characters that precede and follow it
21
+ """
22
+ if not wrd:
23
+ return wrd
24
+
25
+ while not wrd[-1:].isalnum():
26
+ if not wrd:
27
+ break
28
+ wrd = wrd[:-1]
29
+
30
+ while not wrd[:1].isalnum():
31
+ if not wrd:
32
+ break
33
+ wrd = wrd[1:]
34
+ return wrd
pytube/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: F401
2
+ # noreorder
3
+ """
4
+ Pytube: a very serious Python library for downloading YouTube Videos.
5
+ """
6
+ __title__ = "pytube"
7
+ __author__ = "Ronnie Ghose, Taylor Fox Dahlin, Nick Ficano"
8
+ __license__ = "The Unlicense (Unlicense)"
9
+ __js__ = None
10
+ __js_url__ = None
11
+
12
+ from pytube.version import __version__
13
+ from pytube.streams import Stream
14
+ from pytube.captions import Caption
15
+ from pytube.query import CaptionQuery, StreamQuery
16
+ from pytube.__main__ import YouTube
17
+ from pytube.contrib.playlist import Playlist
18
+ from pytube.contrib.channel import Channel
19
+ from pytube.contrib.search import Search
pytube/__main__.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module implements the core developer interface for pytube.
3
+
4
+ The problem domain of the :class:`YouTube <YouTube>` class focuses almost
5
+ exclusively on the developer interface. Pytube offloads the heavy lifting to
6
+ smaller peripheral modules and functions.
7
+
8
+ """
9
+ import logging
10
+ from typing import Any, Callable, Dict, List, Optional
11
+
12
+ import pytube
13
+ import pytube.exceptions as exceptions
14
+ from pytube import extract, request
15
+ from pytube import Stream, StreamQuery
16
+ from pytube.helpers import install_proxy
17
+ from pytube.innertube import InnerTube
18
+ from pytube.metadata import YouTubeMetadata
19
+ from pytube.monostate import Monostate
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class YouTube:
25
+ """Core developer interface for pytube."""
26
+
27
+ def __init__(
28
+ self,
29
+ url: str,
30
+ on_progress_callback: Optional[Callable[[Any, bytes, int], None]] = None,
31
+ on_complete_callback: Optional[Callable[[Any, Optional[str]], None]] = None,
32
+ proxies: Dict[str, str] = None,
33
+ use_oauth: bool = False,
34
+ allow_oauth_cache: bool = True
35
+ ):
36
+ """Construct a :class:`YouTube <YouTube>`.
37
+
38
+ :param str url:
39
+ A valid YouTube watch URL.
40
+ :param func on_progress_callback:
41
+ (Optional) User defined callback function for stream download
42
+ progress events.
43
+ :param func on_complete_callback:
44
+ (Optional) User defined callback function for stream download
45
+ complete events.
46
+ :param dict proxies:
47
+ (Optional) A dict mapping protocol to proxy address which will be used by pytube.
48
+ :param bool use_oauth:
49
+ (Optional) Prompt the user to authenticate to YouTube.
50
+ If allow_oauth_cache is set to True, the user should only be prompted once.
51
+ :param bool allow_oauth_cache:
52
+ (Optional) Cache OAuth tokens locally on the machine. Defaults to True.
53
+ These tokens are only generated if use_oauth is set to True as well.
54
+ """
55
+ self._js: Optional[str] = None # js fetched by js_url
56
+ self._js_url: Optional[str] = None # the url to the js, parsed from watch html
57
+
58
+ self._vid_info: Optional[Dict] = None # content fetched from innertube/player
59
+
60
+ self._watch_html: Optional[str] = None # the html of /watch?v=<video_id>
61
+ self._embed_html: Optional[str] = None
62
+ self._player_config_args: Optional[Dict] = None # inline js in the html containing
63
+ self._age_restricted: Optional[bool] = None
64
+
65
+ self._fmt_streams: Optional[List[Stream]] = None
66
+
67
+ self._initial_data = None
68
+ self._metadata: Optional[YouTubeMetadata] = None
69
+
70
+ # video_id part of /watch?v=<video_id>
71
+ self.video_id = extract.video_id(url)
72
+
73
+ self.watch_url = f"https://youtube.com/watch?v={self.video_id}"
74
+ self.embed_url = f"https://www.youtube.com/embed/{self.video_id}"
75
+
76
+ # Shared between all instances of `Stream` (Borg pattern).
77
+ self.stream_monostate = Monostate(
78
+ on_progress=on_progress_callback, on_complete=on_complete_callback
79
+ )
80
+
81
+ if proxies:
82
+ install_proxy(proxies)
83
+
84
+ self._author = None
85
+ self._title = None
86
+ self._publish_date = None
87
+
88
+ self.use_oauth = use_oauth
89
+ self.allow_oauth_cache = allow_oauth_cache
90
+
91
+ def __repr__(self):
92
+ return f'<pytube.__main__.YouTube object: videoId={self.video_id}>'
93
+
94
+ def __eq__(self, o: object) -> bool:
95
+ # Compare types and urls, if they're same return true, else return false.
96
+ return type(o) == type(self) and o.watch_url == self.watch_url
97
+
98
+ @property
99
+ def watch_html(self):
100
+ if self._watch_html:
101
+ return self._watch_html
102
+ self._watch_html = request.get(url=self.watch_url)
103
+ return self._watch_html
104
+
105
+ @property
106
+ def embed_html(self):
107
+ if self._embed_html:
108
+ return self._embed_html
109
+ self._embed_html = request.get(url=self.embed_url)
110
+ return self._embed_html
111
+
112
+ @property
113
+ def age_restricted(self):
114
+ if self._age_restricted:
115
+ return self._age_restricted
116
+ self._age_restricted = extract.is_age_restricted(self.watch_html)
117
+ return self._age_restricted
118
+
119
+ @property
120
+ def js_url(self):
121
+ if self._js_url:
122
+ return self._js_url
123
+
124
+ if self.age_restricted:
125
+ self._js_url = extract.js_url(self.embed_html)
126
+ else:
127
+ self._js_url = extract.js_url(self.watch_html)
128
+
129
+ return self._js_url
130
+
131
+ @property
132
+ def js(self):
133
+ if self._js:
134
+ return self._js
135
+
136
+ # If the js_url doesn't match the cached url, fetch the new js and update
137
+ # the cache; otherwise, load the cache.
138
+ if pytube.__js_url__ != self.js_url:
139
+ self._js = request.get(self.js_url)
140
+ pytube.__js__ = self._js
141
+ pytube.__js_url__ = self.js_url
142
+ else:
143
+ self._js = pytube.__js__
144
+
145
+ return self._js
146
+
147
+ @property
148
+ def initial_data(self):
149
+ if self._initial_data:
150
+ return self._initial_data
151
+ self._initial_data = extract.initial_data(self.watch_html)
152
+ return self._initial_data
153
+
154
+ @property
155
+ def streaming_data(self):
156
+ """Return streamingData from video info."""
157
+ if 'streamingData' in self.vid_info:
158
+ return self.vid_info['streamingData']
159
+ else:
160
+ self.bypass_age_gate()
161
+ return self.vid_info['streamingData']
162
+
163
+ @property
164
+ def fmt_streams(self):
165
+ """Returns a list of streams if they have been initialized.
166
+
167
+ If the streams have not been initialized, finds all relevant
168
+ streams and initializes them.
169
+ """
170
+ self.check_availability()
171
+ if self._fmt_streams:
172
+ return self._fmt_streams
173
+
174
+ self._fmt_streams = []
175
+
176
+ stream_manifest = extract.apply_descrambler(self.streaming_data)
177
+
178
+ # If the cached js doesn't work, try fetching a new js file
179
+ # https://github.com/pytube/pytube/issues/1054
180
+ try:
181
+ extract.apply_signature(stream_manifest, self.vid_info, self.js)
182
+ except exceptions.ExtractError:
183
+ # To force an update to the js file, we clear the cache and retry
184
+ self._js = None
185
+ self._js_url = None
186
+ pytube.__js__ = None
187
+ pytube.__js_url__ = None
188
+ extract.apply_signature(stream_manifest, self.vid_info, self.js)
189
+
190
+ # build instances of :class:`Stream <Stream>`
191
+ # Initialize stream objects
192
+ for stream in stream_manifest:
193
+ video = Stream(
194
+ stream=stream,
195
+ monostate=self.stream_monostate,
196
+ )
197
+ self._fmt_streams.append(video)
198
+
199
+ self.stream_monostate.title = self.title
200
+ self.stream_monostate.duration = self.length
201
+
202
+ return self._fmt_streams
203
+
204
+ def check_availability(self):
205
+ """Check whether the video is available.
206
+
207
+ Raises different exceptions based on why the video is unavailable,
208
+ otherwise does nothing.
209
+ """
210
+ status, messages = extract.playability_status(self.watch_html)
211
+
212
+ for reason in messages:
213
+ if status == 'UNPLAYABLE':
214
+ if reason == (
215
+ 'Join this channel to get access to members-only content '
216
+ 'like this video, and other exclusive perks.'
217
+ ):
218
+ raise exceptions.MembersOnly(video_id=self.video_id)
219
+ elif reason == 'This live stream recording is not available.':
220
+ raise exceptions.RecordingUnavailable(video_id=self.video_id)
221
+ else:
222
+ raise exceptions.VideoUnavailable(video_id=self.video_id)
223
+ elif status == 'LOGIN_REQUIRED':
224
+ if reason == (
225
+ 'This is a private video. '
226
+ 'Please sign in to verify that you may see it.'
227
+ ):
228
+ raise exceptions.VideoPrivate(video_id=self.video_id)
229
+ elif status == 'ERROR':
230
+ if reason == 'Video unavailable':
231
+ raise exceptions.VideoUnavailable(video_id=self.video_id)
232
+ elif status == 'LIVE_STREAM':
233
+ raise exceptions.LiveStreamError(video_id=self.video_id)
234
+
235
+ @property
236
+ def vid_info(self):
237
+ """Parse the raw vid info and return the parsed result.
238
+
239
+ :rtype: Dict[Any, Any]
240
+ """
241
+ if self._vid_info:
242
+ return self._vid_info
243
+
244
+ innertube = InnerTube(use_oauth=self.use_oauth, allow_cache=self.allow_oauth_cache)
245
+
246
+ innertube_response = innertube.player(self.video_id)
247
+ self._vid_info = innertube_response
248
+ return self._vid_info
249
+
250
+ def bypass_age_gate(self):
251
+ """Attempt to update the vid_info by bypassing the age gate."""
252
+ innertube = InnerTube(
253
+ client='ANDROID_EMBED',
254
+ use_oauth=self.use_oauth,
255
+ allow_cache=self.allow_oauth_cache
256
+ )
257
+ innertube_response = innertube.player(self.video_id)
258
+
259
+ playability_status = innertube_response['playabilityStatus'].get('status', None)
260
+
261
+ # If we still can't access the video, raise an exception
262
+ # (tier 3 age restriction)
263
+ if playability_status == 'UNPLAYABLE':
264
+ raise exceptions.AgeRestrictedError(self.video_id)
265
+
266
+ self._vid_info = innertube_response
267
+
268
+ @property
269
+ def caption_tracks(self) -> List[pytube.Caption]:
270
+ """Get a list of :class:`Caption <Caption>`.
271
+
272
+ :rtype: List[Caption]
273
+ """
274
+ raw_tracks = (
275
+ self.vid_info.get("captions", {})
276
+ .get("playerCaptionsTracklistRenderer", {})
277
+ .get("captionTracks", [])
278
+ )
279
+ return [pytube.Caption(track) for track in raw_tracks]
280
+
281
+ @property
282
+ def captions(self) -> pytube.CaptionQuery:
283
+ """Interface to query caption tracks.
284
+
285
+ :rtype: :class:`CaptionQuery <CaptionQuery>`.
286
+ """
287
+ return pytube.CaptionQuery(self.caption_tracks)
288
+
289
+ @property
290
+ def streams(self) -> StreamQuery:
291
+ """Interface to query both adaptive (DASH) and progressive streams.
292
+
293
+ :rtype: :class:`StreamQuery <StreamQuery>`.
294
+ """
295
+ self.check_availability()
296
+ return StreamQuery(self.fmt_streams)
297
+
298
+ @property
299
+ def thumbnail_url(self) -> str:
300
+ """Get the thumbnail url image.
301
+
302
+ :rtype: str
303
+ """
304
+ thumbnail_details = (
305
+ self.vid_info.get("videoDetails", {})
306
+ .get("thumbnail", {})
307
+ .get("thumbnails")
308
+ )
309
+ if thumbnail_details:
310
+ thumbnail_details = thumbnail_details[-1] # last item has max size
311
+ return thumbnail_details["url"]
312
+
313
+ return f"https://img.youtube.com/vi/{self.video_id}/maxresdefault.jpg"
314
+
315
+ @property
316
+ def publish_date(self):
317
+ """Get the publish date.
318
+
319
+ :rtype: datetime
320
+ """
321
+ if self._publish_date:
322
+ return self._publish_date
323
+ self._publish_date = extract.publish_date(self.watch_html)
324
+ return self._publish_date
325
+
326
+ @publish_date.setter
327
+ def publish_date(self, value):
328
+ """Sets the publish date."""
329
+ self._publish_date = value
330
+
331
+ @property
332
+ def title(self) -> str:
333
+ """Get the video title.
334
+
335
+ :rtype: str
336
+ """
337
+ if self._title:
338
+ return self._title
339
+
340
+ try:
341
+ self._title = self.vid_info['videoDetails']['title']
342
+ except KeyError:
343
+ # Check_availability will raise the correct exception in most cases
344
+ # if it doesn't, ask for a report.
345
+ self.check_availability()
346
+ raise exceptions.PytubeError(
347
+ (
348
+ f'Exception while accessing title of {self.watch_url}. '
349
+ 'Please file a bug report at https://github.com/pytube/pytube'
350
+ )
351
+ )
352
+
353
+ return self._title
354
+
355
+ @title.setter
356
+ def title(self, value):
357
+ """Sets the title value."""
358
+ self._title = value
359
+
360
+ @property
361
+ def description(self) -> str:
362
+ """Get the video description.
363
+
364
+ :rtype: str
365
+ """
366
+ return self.vid_info.get("videoDetails", {}).get("shortDescription")
367
+
368
+ @property
369
+ def rating(self) -> float:
370
+ """Get the video average rating.
371
+
372
+ :rtype: float
373
+
374
+ """
375
+ return self.vid_info.get("videoDetails", {}).get("averageRating")
376
+
377
+ @property
378
+ def length(self) -> int:
379
+ """Get the video length in seconds.
380
+
381
+ :rtype: int
382
+ """
383
+ return int(self.vid_info.get('videoDetails', {}).get('lengthSeconds'))
384
+
385
+ @property
386
+ def views(self) -> int:
387
+ """Get the number of the times the video has been viewed.
388
+
389
+ :rtype: int
390
+ """
391
+ return int(self.vid_info.get("videoDetails", {}).get("viewCount"))
392
+
393
+ @property
394
+ def author(self) -> str:
395
+ """Get the video author.
396
+ :rtype: str
397
+ """
398
+ if self._author:
399
+ return self._author
400
+ self._author = self.vid_info.get("videoDetails", {}).get(
401
+ "author", "unknown"
402
+ )
403
+ return self._author
404
+
405
+ @author.setter
406
+ def author(self, value):
407
+ """Set the video author."""
408
+ self._author = value
409
+
410
+ @property
411
+ def keywords(self) -> List[str]:
412
+ """Get the video keywords.
413
+
414
+ :rtype: List[str]
415
+ """
416
+ return self.vid_info.get('videoDetails', {}).get('keywords', [])
417
+
418
+ @property
419
+ def channel_id(self) -> str:
420
+ """Get the video poster's channel id.
421
+
422
+ :rtype: str
423
+ """
424
+ return self.vid_info.get('videoDetails', {}).get('channelId', None)
425
+
426
+ @property
427
+ def channel_url(self) -> str:
428
+ """Construct the channel url for the video's poster from the channel id.
429
+
430
+ :rtype: str
431
+ """
432
+ return f'https://www.youtube.com/channel/{self.channel_id}'
433
+
434
+ @property
435
+ def metadata(self) -> Optional[YouTubeMetadata]:
436
+ """Get the metadata for the video.
437
+
438
+ :rtype: YouTubeMetadata
439
+ """
440
+ if self._metadata:
441
+ return self._metadata
442
+ else:
443
+ self._metadata = extract.metadata(self.initial_data)
444
+ return self._metadata
445
+
446
+ def register_on_progress_callback(self, func: Callable[[Any, bytes, int], None]):
447
+ """Register a download progress callback function post initialization.
448
+
449
+ :param callable func:
450
+ A callback function that takes ``stream``, ``chunk``,
451
+ and ``bytes_remaining`` as parameters.
452
+
453
+ :rtype: None
454
+
455
+ """
456
+ self.stream_monostate.on_progress = func
457
+
458
+ def register_on_complete_callback(self, func: Callable[[Any, Optional[str]], None]):
459
+ """Register a download complete callback function post initialization.
460
+
461
+ :param callable func:
462
+ A callback function that takes ``stream`` and ``file_path``.
463
+
464
+ :rtype: None
465
+
466
+ """
467
+ self.stream_monostate.on_complete = func
pytube/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (830 Bytes). View file
 
pytube/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (819 Bytes). View file
 
pytube/__pycache__/__main__.cpython-310.pyc ADDED
Binary file (12.6 kB). View file
 
pytube/__pycache__/__main__.cpython-39.pyc ADDED
Binary file (12.8 kB). View file
 
pytube/__pycache__/captions.cpython-310.pyc ADDED
Binary file (4.95 kB). View file
 
pytube/__pycache__/captions.cpython-39.pyc ADDED
Binary file (4.92 kB). View file
 
pytube/__pycache__/cipher.cpython-310.pyc ADDED
Binary file (18.9 kB). View file
 
pytube/__pycache__/cipher.cpython-39.pyc ADDED
Binary file (18.9 kB). View file
 
pytube/__pycache__/exceptions.cpython-310.pyc ADDED
Binary file (5.01 kB). View file
 
pytube/__pycache__/exceptions.cpython-39.pyc ADDED
Binary file (5.55 kB). View file
 
pytube/__pycache__/extract.cpython-310.pyc ADDED
Binary file (15.4 kB). View file
 
pytube/__pycache__/extract.cpython-39.pyc ADDED
Binary file (15.5 kB). View file
 
pytube/__pycache__/helpers.cpython-310.pyc ADDED
Binary file (9.83 kB). View file
 
pytube/__pycache__/helpers.cpython-39.pyc ADDED
Binary file (9.84 kB). View file
 
pytube/__pycache__/innertube.cpython-310.pyc ADDED
Binary file (8.63 kB). View file
 
pytube/__pycache__/innertube.cpython-39.pyc ADDED
Binary file (8.63 kB). View file
 
pytube/__pycache__/itags.cpython-310.pyc ADDED
Binary file (2.78 kB). View file
 
pytube/__pycache__/itags.cpython-39.pyc ADDED
Binary file (2.26 kB). View file
 
pytube/__pycache__/metadata.cpython-310.pyc ADDED
Binary file (1.71 kB). View file
 
pytube/__pycache__/metadata.cpython-39.pyc ADDED
Binary file (1.71 kB). View file
 
pytube/__pycache__/monostate.cpython-310.pyc ADDED
Binary file (728 Bytes). View file
 
pytube/__pycache__/monostate.cpython-39.pyc ADDED
Binary file (695 Bytes). View file
 
pytube/__pycache__/parser.cpython-310.pyc ADDED
Binary file (3.94 kB). View file
 
pytube/__pycache__/parser.cpython-39.pyc ADDED
Binary file (3.93 kB). View file
 
pytube/__pycache__/query.cpython-310.pyc ADDED
Binary file (14.1 kB). View file
 
pytube/__pycache__/query.cpython-39.pyc ADDED
Binary file (14.3 kB). View file
 
pytube/__pycache__/request.cpython-310.pyc ADDED
Binary file (5.74 kB). View file
 
pytube/__pycache__/request.cpython-39.pyc ADDED
Binary file (5.69 kB). View file
 
pytube/__pycache__/streams.cpython-310.pyc ADDED
Binary file (10.9 kB). View file
 
pytube/__pycache__/streams.cpython-39.pyc ADDED
Binary file (10.8 kB). View file
 
pytube/__pycache__/version.cpython-310.pyc ADDED
Binary file (220 Bytes). View file
 
pytube/__pycache__/version.cpython-39.pyc ADDED
Binary file (214 Bytes). View file
 
pytube/captions.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import time
4
+ import xml.etree.ElementTree as ElementTree
5
+ from html import unescape
6
+ from typing import Dict, Optional
7
+
8
+ from pytube import request
9
+ from pytube.helpers import safe_filename, target_directory
10
+
11
+
12
class Caption:
    """Container for a single caption track and its export helpers."""

    def __init__(self, caption_track: Dict):
        """Construct a :class:`Caption <Caption>`.

        :param dict caption_track:
            Caption track data extracted from ``watch_html``. Must contain
            ``name`` and ``vssId``; ``baseUrl`` is optional.
        """
        self.url = caption_track.get("baseUrl")

        # Certain videos have runs instead of simpleText
        # this handles that edge case
        name_dict = caption_track['name']
        if 'simpleText' in name_dict:
            self.name = name_dict['simpleText']
        else:
            # The last run carrying a 'text' key wins.
            for el in name_dict['runs']:
                if 'text' in el:
                    self.name = el['text']

        # Use "vssId" instead of "languageCode", fix issue #779
        self.code = caption_track["vssId"]
        # Remove preceding '.' for backwards compatibility, e.g.:
        # English -> vssId: .en, languageCode: en
        # English (auto-generated) -> vssId: a.en, languageCode: en
        self.code = self.code.strip('.')

    @property
    def xml_captions(self) -> str:
        """Download the xml caption tracks."""
        return request.get(self.url)

    def generate_srt_captions(self) -> str:
        """Generate "SubRip Subtitle" captions.

        Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and
        recompiles them into the "SubRip Subtitle" format.
        """
        return self.xml_caption_to_srt(self.xml_captions)

    @staticmethod
    def float_to_srt_time_format(d: float) -> str:
        """Convert a duration in seconds into the srt ``HH:MM:SS,mmm`` format.

        The value is rounded to whole milliseconds first, so a fraction that
        rounds up (e.g. ``0.9996``) carries into the seconds field instead of
        producing a malformed stamp, and hours are not wrapped at 24.

        :param float d:
            Duration in seconds.
        :rtype: str
        :returns:
            SubRip Subtitle (str) formatted time duration.

        float_to_srt_time_format(3.89) -> '00:00:03,890'
        """
        total_ms = round(d * 1000)
        hours, remainder = divmod(total_ms, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        seconds, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

    def xml_caption_to_srt(self, xml_captions: str) -> str:
        """Convert xml caption tracks to "SubRip Subtitle (srt)".

        :param str xml_captions:
            XML formatted caption tracks.
        :rtype: str
        :returns:
            The numbered srt segments separated by blank lines.
        """
        segments = []
        root = ElementTree.fromstring(xml_captions)
        # srt sequence numbers are 1-indexed.
        for sequence_number, child in enumerate(root, start=1):
            text = child.text or ""
            # Join wrapped lines, then collapse the doubled spaces that
            # joining can leave behind.
            caption = unescape(text.replace("\n", " ").replace("  ", " "))
            # A missing "dur" attribute means a zero-length cue.
            duration = float(child.attrib.get("dur", 0.0))
            start = float(child.attrib["start"])
            end = start + duration
            line = "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=self.float_to_srt_time_format(start),
                end=self.float_to_srt_time_format(end),
                text=caption,
            )
            segments.append(line)
        return "\n".join(segments).strip()

    def download(
        self,
        title: str,
        srt: bool = True,
        output_path: Optional[str] = None,
        filename_prefix: Optional[str] = None,
    ) -> str:
        """Write the caption track to disk.

        :param title:
            Output filename (stem only) for writing the caption file.
            A trailing ``.srt`` or ``.xml`` extension is stripped.
        :type title: str
        :param srt:
            Set to True to download srt, false to download xml. Defaults to True.
        :type srt: bool
        :param output_path:
            (optional) Output path for writing the file. If one is not
            specified, defaults to the current working directory.
        :type output_path: str or None
        :param filename_prefix:
            (optional) A string that will be prepended to the filename.
            For example a number in a playlist or the name of a series.
            If one is not specified, nothing will be prepended
            This is separate from filename so you can use the default
            filename but still add a prefix.
        :type filename_prefix: str or None

        :rtype: str
        :returns:
            Path of the file that was written.
        """
        if title.endswith((".srt", ".xml")):
            filename = ".".join(title.split(".")[:-1])
        else:
            filename = title

        if filename_prefix:
            filename = f"{safe_filename(filename_prefix)}{filename}"

        filename = safe_filename(filename)

        # The language code is appended after sanitising so it is kept verbatim.
        filename += f" ({self.code})"

        if srt:
            filename += ".srt"
        else:
            filename += ".xml"

        file_path = os.path.join(target_directory(output_path), filename)

        with open(file_path, "w", encoding="utf-8") as file_handle:
            if srt:
                file_handle.write(self.generate_srt_captions())
            else:
                file_handle.write(self.xml_captions)

        return file_path

    def __repr__(self):
        """Printable object representation."""
        return '<Caption lang="{s.name}" code="{s.code}">'.format(s=self)
pytube/cipher.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module contains all logic necessary to decipher the signature.
3
+
4
+ YouTube's strategy to restrict downloading videos is to send a ciphered version
5
+ of the signature to the client, along with the decryption algorithm obfuscated
6
+ in JavaScript. For the clients to play the videos, JavaScript must take the
7
+ ciphered version, cycle it through a series of "transform functions," and then
8
+ sign the media URL with the output.
9
+
10
+ This module is responsible for (1) finding and extracting those "transform
11
+ functions", (2) mapping them to Python equivalents, and (3) taking the ciphered
12
+ signature and decoding it.
13
+
14
+ """
15
+ import logging
16
+ import re
17
+ from itertools import chain
18
+ from typing import Any, Callable, Dict, List, Optional, Tuple
19
+
20
+ from pytube.exceptions import ExtractError, RegexMatchError
21
+ from pytube.helpers import cache, regex_search
22
+ from pytube.parser import find_object_from_startpoint, throttling_array_split
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class Cipher:
28
+ def __init__(self, js: str):
29
+ self.transform_plan: List[str] = get_transform_plan(js)
30
+ var_regex = re.compile(r"^\w+\W")
31
+ var_match = var_regex.search(self.transform_plan[0])
32
+ if not var_match:
33
+ raise RegexMatchError(
34
+ caller="__init__", pattern=var_regex.pattern
35
+ )
36
+ var = var_match.group(0)[:-1]
37
+ self.transform_map = get_transform_map(js, var)
38
+ self.js_func_patterns = [
39
+ r"\w+\.(\w+)\(\w,(\d+)\)",
40
+ r"\w+\[(\"\w+\")\]\(\w,(\d+)\)"
41
+ ]
42
+
43
+ self.throttling_plan = get_throttling_plan(js)
44
+ self.throttling_array = get_throttling_function_array(js)
45
+
46
+ self.calculated_n = None
47
+
48
+ def calculate_n(self, initial_n: list):
49
+ """Converts n to the correct value to prevent throttling."""
50
+ if self.calculated_n:
51
+ return self.calculated_n
52
+
53
+ # First, update all instances of 'b' with the list(initial_n)
54
+ for i in range(len(self.throttling_array)):
55
+ if self.throttling_array[i] == 'b':
56
+ self.throttling_array[i] = initial_n
57
+
58
+ for step in self.throttling_plan:
59
+ curr_func = self.throttling_array[int(step[0])]
60
+ if not callable(curr_func):
61
+ logger.debug(f'{curr_func} is not callable.')
62
+ logger.debug(f'Throttling array:\n{self.throttling_array}\n')
63
+ raise ExtractError(f'{curr_func} is not callable.')
64
+
65
+ first_arg = self.throttling_array[int(step[1])]
66
+
67
+ if len(step) == 2:
68
+ curr_func(first_arg)
69
+ elif len(step) == 3:
70
+ second_arg = self.throttling_array[int(step[2])]
71
+ curr_func(first_arg, second_arg)
72
+
73
+ self.calculated_n = ''.join(initial_n)
74
+ return self.calculated_n
75
+
76
+ def get_signature(self, ciphered_signature: str) -> str:
77
+ """Decipher the signature.
78
+
79
+ Taking the ciphered signature, applies the transform functions.
80
+
81
+ :param str ciphered_signature:
82
+ The ciphered signature sent in the ``player_config``.
83
+ :rtype: str
84
+ :returns:
85
+ Decrypted signature required to download the media content.
86
+ """
87
+ signature = list(ciphered_signature)
88
+
89
+ for js_func in self.transform_plan:
90
+ name, argument = self.parse_function(js_func) # type: ignore
91
+ signature = self.transform_map[name](signature, argument)
92
+ logger.debug(
93
+ "applied transform function\n"
94
+ "output: %s\n"
95
+ "js_function: %s\n"
96
+ "argument: %d\n"
97
+ "function: %s",
98
+ "".join(signature),
99
+ name,
100
+ argument,
101
+ self.transform_map[name],
102
+ )
103
+
104
+ return "".join(signature)
105
+
106
+ @cache
107
+ def parse_function(self, js_func: str) -> Tuple[str, int]:
108
+ """Parse the Javascript transform function.
109
+
110
+ Break a JavaScript transform function down into a two element ``tuple``
111
+ containing the function name and some integer-based argument.
112
+
113
+ :param str js_func:
114
+ The JavaScript version of the transform function.
115
+ :rtype: tuple
116
+ :returns:
117
+ two element tuple containing the function name and an argument.
118
+
119
+ **Example**:
120
+
121
+ parse_function('DE.AJ(a,15)')
122
+ ('AJ', 15)
123
+
124
+ """
125
+ logger.debug("parsing transform function")
126
+ for pattern in self.js_func_patterns:
127
+ regex = re.compile(pattern)
128
+ parse_match = regex.search(js_func)
129
+ if parse_match:
130
+ fn_name, fn_arg = parse_match.groups()
131
+ return fn_name, int(fn_arg)
132
+
133
+ raise RegexMatchError(
134
+ caller="parse_function", pattern="js_func_patterns"
135
+ )
136
+
137
+
138
def get_initial_function_name(js: str) -> str:
    """Extract the name of the function responsible for computing the signature.

    The patterns are tried in order and the first one that matches wins.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        Function name from regex match
    :raises RegexMatchError:
        If none of the known patterns match.
    """

    function_patterns = [
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',  # noqa: E501
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r"\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(",
        r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
        r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(",  # noqa: E501
    ]
    logger.debug("finding initial function name")
    for pattern in function_patterns:
        regex = re.compile(pattern)
        function_match = regex.search(js)
        if function_match:
            logger.debug("finished regex search, matched: %s", pattern)
            # Read the named group: in the quoted-"signature" pattern the
            # first positional group is the quote character, not the name.
            return function_match.group("sig")

    raise RegexMatchError(
        caller="get_initial_function_name", pattern="multiple"
    )
172
+
173
+
174
+ def get_transform_plan(js: str) -> List[str]:
175
+ """Extract the "transform plan".
176
+
177
+ The "transform plan" is the functions that the ciphered signature is
178
+ cycled through to obtain the actual signature.
179
+
180
+ :param str js:
181
+ The contents of the base.js asset file.
182
+
183
+ **Example**:
184
+
185
+ ['DE.AJ(a,15)',
186
+ 'DE.VR(a,3)',
187
+ 'DE.AJ(a,51)',
188
+ 'DE.VR(a,3)',
189
+ 'DE.kT(a,51)',
190
+ 'DE.kT(a,8)',
191
+ 'DE.VR(a,3)',
192
+ 'DE.kT(a,21)']
193
+ """
194
+ name = re.escape(get_initial_function_name(js))
195
+ pattern = r"%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}" % name
196
+ logger.debug("getting transform plan")
197
+ return regex_search(pattern, js, group=1).split(";")
198
+
199
+
200
+ def get_transform_object(js: str, var: str) -> List[str]:
201
+ """Extract the "transform object".
202
+
203
+ The "transform object" contains the function definitions referenced in the
204
+ "transform plan". The ``var`` argument is the obfuscated variable name
205
+ which contains these functions, for example, given the function call
206
+ ``DE.AJ(a,15)`` returned by the transform plan, "DE" would be the var.
207
+
208
+ :param str js:
209
+ The contents of the base.js asset file.
210
+ :param str var:
211
+ The obfuscated variable name that stores an object with all functions
212
+ that descrambles the signature.
213
+
214
+ **Example**:
215
+
216
+ >>> get_transform_object(js, 'DE')
217
+ ['AJ:function(a){a.reverse()}',
218
+ 'VR:function(a,b){a.splice(0,b)}',
219
+ 'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}']
220
+
221
+ """
222
+ pattern = r"var %s={(.*?)};" % re.escape(var)
223
+ logger.debug("getting transform object")
224
+ regex = re.compile(pattern, flags=re.DOTALL)
225
+ transform_match = regex.search(js)
226
+ if not transform_match:
227
+ raise RegexMatchError(caller="get_transform_object", pattern=pattern)
228
+
229
+ return transform_match.group(1).replace("\n", " ").split(", ")
230
+
231
+
232
+ def get_transform_map(js: str, var: str) -> Dict:
233
+ """Build a transform function lookup.
234
+
235
+ Build a lookup table of obfuscated JavaScript function names to the
236
+ Python equivalents.
237
+
238
+ :param str js:
239
+ The contents of the base.js asset file.
240
+ :param str var:
241
+ The obfuscated variable name that stores an object with all functions
242
+ that descrambles the signature.
243
+
244
+ """
245
+ transform_object = get_transform_object(js, var)
246
+ mapper = {}
247
+ for obj in transform_object:
248
+ # AJ:function(a){a.reverse()} => AJ, function(a){a.reverse()}
249
+ name, function = obj.split(":", 1)
250
+ fn = map_functions(function)
251
+ mapper[name] = fn
252
+ return mapper
253
+
254
+
255
def get_throttling_function_name(js: str) -> str:
    """Extract the name of the function that computes the throttling parameter.

    The call site is either a direct call (``b = iha(b)``) or an indexed call
    through a one-off array (``b = Bpa[0](b)`` with ``var Bpa = [iha];``). In
    the indexed case the array literal is looked up and the entry at the
    given index is returned.

    :param str js:
        The contents of the base.js asset file.
    :rtype: str
    :returns:
        The name of the function used to compute the throttling parameter.
    :raises RegexMatchError:
        If no pattern matches, or an indexed call is found but the array
        definition it refers to is not.
    """
    function_patterns = [
        # https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
        # https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8
        # var Bpa = [iha];
        # ...
        # a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b),
        # Bpa.length || iha("")) }};
        # In the above case, `iha` is the relevant function name
        r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&\s*'
        r'\([a-z]\s*=\s*([a-zA-Z0-9$]+)(\[\d+\])?\([a-z]\)',
    ]
    logger.debug('Finding throttling function name')
    for pattern in function_patterns:
        regex = re.compile(pattern)
        function_match = regex.search(js)
        if not function_match:
            continue

        logger.debug("finished regex search, matched: %s", pattern)
        name = function_match.group(1)
        # Patterns with a single group never carry an index.
        idx = function_match.group(2) if regex.groups > 1 else None
        if not idx:
            # Direct call: the matched identifier is the function itself.
            return name

        # Indexed call: resolve the identifier through its array literal.
        idx = idx.strip("[]")
        array = re.search(
            r'var {nfunc}\s*=\s*(\[.+?\]);'.format(
                nfunc=re.escape(name)),
            js
        )
        if array:
            entries = [x.strip() for x in array.group(1).strip("[]").split(",")]
            return entries[int(idx)]

    raise RegexMatchError(
        caller="get_throttling_function_name", pattern="multiple"
    )
299
+
300
+
301
+ def get_throttling_function_code(js: str) -> str:
302
+ """Extract the raw code for the throttling function.
303
+
304
+ :param str js:
305
+ The contents of the base.js asset file.
306
+ :rtype: str
307
+ :returns:
308
+ The raw JavaScript source of the function used to compute the throttling parameter.
309
+ """
310
+ # Begin by extracting the correct function name
311
+ name = re.escape(get_throttling_function_name(js))
312
+
313
+ # Identify where the function is defined
314
+ pattern_start = r"%s=function\(\w\)" % name
315
+ regex = re.compile(pattern_start)
316
+ match = regex.search(js)
317
+
318
+ # Extract the code within curly braces for the function itself, and merge any split lines
319
+ code_lines_list = find_object_from_startpoint(js, match.span()[1]).split('\n')
320
+ joined_lines = "".join(code_lines_list)
321
+
322
+ # Prepend function definition (e.g. `Dea=function(a)`)
323
+ return match.group(0) + joined_lines
324
+
325
+
326
def get_throttling_function_array(js: str) -> List[Any]:
    """Extract the "c" array.

    Each element of the JavaScript array literal is converted in this order:
    integers become ``int``, ``null`` becomes a reference to the resulting
    array itself, double-quoted strings are unquoted, and ``function``
    literals are replaced by the first matching Python equivalent. Anything
    else is kept as its raw string.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        The array of various integers, arrays, and functions.
    :raises RegexMatchError:
        If the ``,c=[`` array start cannot be found in the throttling code.
    """
    raw_code = get_throttling_function_code(js)

    array_start = r",c=\["
    array_regex = re.compile(array_start)
    match = array_regex.search(raw_code)
    if not match:
        raise RegexMatchError(
            caller="get_throttling_function_array", pattern=array_start
        )

    array_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1)
    str_array = throttling_array_split(array_raw)

    # JavaScript function body pattern -> Python equivalent. Built once,
    # not once per array element.
    mapper = (
        (r"{for\(\w=\(\w%\w\.length\+\w\.length\)%\w\.length;\w--;\)\w\.unshift\(\w.pop\(\)\)}", throttling_unshift),  # noqa:E501
        (r"{\w\.reverse\(\)}", throttling_reverse),
        (r"{\w\.push\(\w\)}", throttling_push),
        (r";var\s\w=\w\[0\];\w\[0\]=\w\[\w\];\w\[\w\]=\w}", throttling_swap),
        (r"case\s\d+", throttling_cipher_function),
        (r"\w\.splice\(0,1,\w\.splice\(\w,1,\w\[0\]\)\[0\]\)", throttling_nested_splice),  # noqa:E501
        (r";\w\.splice\(\w,1\)}", js_splice),
        (r"\w\.splice\(-\w\)\.reverse\(\)\.forEach\(function\(\w\){\w\.unshift\(\w\)}\)", throttling_prepend),  # noqa:E501
        (r"for\(var \w=\w\.length;\w;\)\w\.push\(\w\.splice\(--\w,1\)\[0\]\)}", throttling_reverse),  # noqa:E501
    )

    converted_array = []
    for el in str_array:
        try:
            converted_array.append(int(el))
            continue
        except ValueError:
            # Not an integer value.
            pass

        if el == 'null':
            converted_array.append(None)
            continue

        if el.startswith('"') and el.endswith('"'):
            # Convert e.g. '"abcdef"' to string without quotation marks, 'abcdef'
            converted_array.append(el[1:-1])
            continue

        if el.startswith('function'):
            found = False
            for pattern, fn in mapper:
                if re.search(pattern, el):
                    converted_array.append(fn)
                    found = True
                    # Exactly one entry per source element: a second match
                    # would shift every later index used by the plan.
                    break
            if found:
                continue

        converted_array.append(el)

    # Replace null elements with array itself
    for i, value in enumerate(converted_array):
        if value is None:
            converted_array[i] = converted_array

    return converted_array
390
+
391
+
392
+ def get_throttling_plan(js: str):
393
+ """Extract the "throttling plan".
394
+
395
+ The "throttling plan" is a list of tuples used for calling functions
396
+ in the c array. The first element of the tuple is the index of the
397
+ function to call, and any remaining elements of the tuple are arguments
398
+ to pass to that function.
399
+
400
+ :param str js:
401
+ The contents of the base.js asset file.
402
+ :returns:
403
+ A list of tuples describing each throttling step (function index, then argument indexes).
404
+ """
405
+ raw_code = get_throttling_function_code(js)
406
+
407
+ transform_start = r"try{"
408
+ plan_regex = re.compile(transform_start)
409
+ match = plan_regex.search(raw_code)
410
+
411
+ transform_plan_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1)
412
+
413
+ # Steps are either c[x](c[y]) or c[x](c[y],c[z])
414
+ step_start = r"c\[(\d+)\]\(c\[(\d+)\](,c(\[(\d+)\]))?\)"
415
+ step_regex = re.compile(step_start)
416
+ matches = step_regex.findall(transform_plan_raw)
417
+ transform_steps = []
418
+ for match in matches:
419
+ if match[4] != '':
420
+ transform_steps.append((match[0],match[1],match[4]))
421
+ else:
422
+ transform_steps.append((match[0],match[1]))
423
+
424
+ return transform_steps
425
+
426
+
427
def reverse(arr: List, _: Optional[Any] = None):
    """Return a reversed copy of a list.

    This function is equivalent to:

    .. code-block:: javascript

        function(a, b) { a.reverse() }

    The second argument is unused; it is accepted because the transform
    functions are universally called with two arguments.

    :param list arr:
        The list to reverse. It is not modified.
    :returns:
        A new list with the elements in reverse order.

    **Example**:

    >>> reverse([1, 2, 3, 4])
    [4, 3, 2, 1]
    """
    return arr[::-1]
445
+
446
+
447
def splice(arr: List, b: int):
    """Drop the first ``b`` items of a list.

    This function is equivalent to the state of ``a`` after:

    .. code-block:: javascript

        function(a, b) { a.splice(0, b) }

    :param list arr:
        The list to trim. It is not modified.
    :param int b:
        Number of leading items to drop.
    :returns:
        A new list holding the items that remain.

    **Example**:

    >>> splice([1, 2, 3, 4], 2)
    [3, 4]
    """
    return arr[b:]
462
+
463
+
464
def swap(arr: List, b: int):
    """Swap the first element with the one at ``b`` modulus the list length.

    This function is equivalent to:

    .. code-block:: javascript

        function(a, b) { var c=a[0];a[0]=a[b%a.length];a[b]=c }

    :param list arr:
        The list to operate on. It is not modified.
    :param int b:
        Position to swap with index 0, taken modulo ``len(arr)``.
    :returns:
        A new list of the same length with the two elements exchanged.

    **Example**:

    >>> swap([1, 2, 3, 4], 2)
    [3, 2, 1, 4]
    """
    r = b % len(arr)
    swapped = list(arr)
    # A plain exchange also covers r == 0, where splicing slices together
    # would emit the first element twice.
    swapped[0], swapped[r] = swapped[r], swapped[0]
    return swapped
480
+
481
+
482
+ def throttling_reverse(arr: list):
483
+ """Reverses the input list.
484
+
485
+ Needs to do an in-place reversal so that the passed list gets changed.
486
+ To accomplish this, we create a reversed copy, and then change each
487
+ individual element.
488
+ """
489
+ reverse_copy = arr.copy()[::-1]
490
+ for i in range(len(reverse_copy)):
491
+ arr[i] = reverse_copy[i]
492
+
493
+
494
+ def throttling_push(d: list, e: Any):
495
+ """Pushes an element onto a list."""
496
+ d.append(e)
497
+
498
+
499
+ def throttling_mod_func(d: list, e: int):
500
+ """Perform the modular function from the throttling array functions.
501
+
502
+ In the javascript, the modular operation is as follows:
503
+ e = (e % d.length + d.length) % d.length
504
+
505
+ We simply translate this to python here.
506
+ """
507
+ return (e % len(d) + len(d)) % len(d)
508
+
509
+
510
+ def throttling_unshift(d: list, e: int):
511
+ """Rotates the elements of the list to the right.
512
+
513
+ In the javascript, the operation is as follows:
514
+ for(e=(e%d.length+d.length)%d.length;e--;)d.unshift(d.pop())
515
+ """
516
+ e = throttling_mod_func(d, e)
517
+ new_arr = d[-e:] + d[:-e]
518
+ d.clear()
519
+ for el in new_arr:
520
+ d.append(el)
521
+
522
+
523
+ def throttling_cipher_function(d: list, e: str):
524
+ """This ciphers d with e to generate a new list.
525
+
526
+ In the javascript, the operation is as follows:
527
+ var h = [A-Za-z0-9-_], f = 96; // simplified from switch-case loop
528
+ d.forEach(
529
+ function(l,m,n){
530
+ this.push(
531
+ n[m]=h[
532
+ (h.indexOf(l)-h.indexOf(this[m])+m-32+f--)%h.length
533
+ ]
534
+ )
535
+ },
536
+ e.split("")
537
+ )
538
+ """
539
+ h = list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_')
540
+ f = 96
541
+ # by naming it "this" we can more closely reflect the js
542
+ this = list(e)
543
+
544
+ # This is so we don't run into weirdness with enumerate while
545
+ # we change the input list
546
+ copied_list = d.copy()
547
+
548
+ for m, l in enumerate(copied_list):
549
+ bracket_val = (h.index(l) - h.index(this[m]) + m - 32 + f) % len(h)
550
+ this.append(
551
+ h[bracket_val]
552
+ )
553
+ d[m] = h[bracket_val]
554
+ f -= 1
555
+
556
+
557
+ def throttling_nested_splice(d: list, e: int):
558
+ """Nested splice function in throttling js.
559
+
560
+ In the javascript, the operation is as follows:
561
+ function(d,e){
562
+ e=(e%d.length+d.length)%d.length;
563
+ d.splice(
564
+ 0,
565
+ 1,
566
+ d.splice(
567
+ e,
568
+ 1,
569
+ d[0]
570
+ )[0]
571
+ )
572
+ }
573
+
574
+ While testing, all this seemed to do is swap element 0 and e,
575
+ but the actual process is preserved in case there was an edge
576
+ case that was not considered.
577
+ """
578
+ e = throttling_mod_func(d, e)
579
+ inner_splice = js_splice(
580
+ d,
581
+ e,
582
+ 1,
583
+ d[0]
584
+ )
585
+ js_splice(
586
+ d,
587
+ 0,
588
+ 1,
589
+ inner_splice[0]
590
+ )
591
+
592
+
593
+ def throttling_prepend(d: list, e: int):
594
+ """
595
+
596
+ In the javascript, the operation is as follows:
597
+ function(d,e){
598
+ e=(e%d.length+d.length)%d.length;
599
+ d.splice(-e).reverse().forEach(
600
+ function(f){
601
+ d.unshift(f)
602
+ }
603
+ )
604
+ }
605
+
606
+ Effectively, this moves the last e elements of d to the beginning.
607
+ """
608
+ start_len = len(d)
609
+ # First, calculate e
610
+ e = throttling_mod_func(d, e)
611
+
612
+ # Then do the prepending
613
+ new_arr = d[-e:] + d[:-e]
614
+
615
+ # And update the input list
616
+ d.clear()
617
+ for el in new_arr:
618
+ d.append(el)
619
+
620
+ end_len = len(d)
621
+ assert start_len == end_len
622
+
623
+
624
+ def throttling_swap(d: list, e: int):
625
+ """Swap positions of the 0'th and e'th elements in-place."""
626
+ e = throttling_mod_func(d, e)
627
+ f = d[0]
628
+ d[0] = d[e]
629
+ d[e] = f
630
+
631
+
632
def js_splice(arr: list, start: int, delete_count=None, *items):
    """Implementation of javascript's splice function.

    The list is modified in place.

    :param list arr:
        Array to splice
    :param int start:
        Index at which to start changing the array. Values past the end are
        clamped to the length; negative values count back from the end.
    :param int delete_count:
        Number of elements to delete from the array. ``None`` (omitted)
        deletes everything from ``start`` onwards; zero or a negative value
        deletes nothing.
    :param *items:
        Items to add to the array
    :returns:
        A list of the deleted elements.

    Reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice  # noqa:E501
    """
    length = len(arr)

    # Special conditions for start value
    try:
        if start > length:
            start = length
        elif start < 0:
            # If start is negative, count backwards from end (floor at 0)
            start = max(length + start, 0)
    except TypeError:
        # Non-integer start values are treated as 0 in js
        start = 0

    remaining = length - start
    if delete_count is None or delete_count >= remaining:
        # Omitted, or larger than what is left: delete through the end.
        delete_count = remaining
    elif delete_count < 0:
        delete_count = 0

    deleted_elements = arr[start:start + delete_count]

    # Slice assignment swaps the deleted run for the new items in place.
    arr[start:start + delete_count] = items

    return deleted_elements
672
+
673
+
674
+ def map_functions(js_func: str) -> Callable:
675
+ """For a given JavaScript transform function, return the Python equivalent.
676
+
677
+ :param str js_func:
678
+ The JavaScript version of the transform function.
679
+ """
680
+ mapper = (
681
+ # function(a){a.reverse()}
682
+ (r"{\w\.reverse\(\)}", reverse),
683
+ # function(a,b){a.splice(0,b)}
684
+ (r"{\w\.splice\(0,\w\)}", splice),
685
+ # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}
686
+ (r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\]=\w}", swap),
687
+ # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c}
688
+ (
689
+ r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\%\w.length\]=\w}",
690
+ swap,
691
+ ),
692
+ )
693
+
694
+ for pattern, fn in mapper:
695
+ if re.search(pattern, js_func):
696
+ return fn
697
+ raise RegexMatchError(caller="map_functions", pattern="multiple")
pytube/cli.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """A simple command line application to download youtube videos."""
3
+ import argparse
4
+ import gzip
5
+ import json
6
+ import logging
7
+ import os
8
+ import shutil
9
+ import sys
10
+ import datetime as dt
11
+ import subprocess # nosec
12
+ from typing import List, Optional
13
+
14
+ import pytube.exceptions as exceptions
15
+ from pytube import __version__
16
+ from pytube import CaptionQuery, Playlist, Stream, YouTube
17
+ from pytube.helpers import safe_filename, setup_logger
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def main():
    """Command line application to download youtube videos."""
    # NOTE: the docstring above is also passed to argparse as the program
    # description, so its text is part of the CLI output.
    # noinspection PyTypeChecker
    cli_parser = argparse.ArgumentParser(description=main.__doc__)
    args = _parse_args(cli_parser)

    if args.verbose:
        # An unset/empty --logfile means "no log file".
        setup_logger(logging.DEBUG, log_filename=args.logfile or None)
        logger.debug(f'Pytube version: {__version__}')

    url = args.url
    if not url or "youtu" not in url:
        cli_parser.print_help()
        sys.exit(1)

    # Single video: handle it and finish.
    if "/playlist" not in url:
        print("Loading video...")
        _perform_args_on_youtube(YouTube(url), args)
        return

    print("Loading playlist...")
    playlist = Playlist(url)
    if not args.target:
        # Default the output directory to the (sanitised) playlist title.
        args.target = safe_filename(playlist.title)
    for video in playlist.videos:
        try:
            _perform_args_on_youtube(video, args)
        except exceptions.PytubeError as err:
            # One failing video must not abort the rest of the playlist.
            print(f"There was an error with video: {video}")
            print(err)
54
+
55
+
56
def _perform_args_on_youtube(
    youtube: YouTube, args: argparse.Namespace
) -> None:
    """Run every action selected on the command line against one video.

    Actions are executed in a fixed order; each one runs only when its
    corresponding option holds a truthy value.

    :param YouTube youtube:
        The video to operate on.
    :param argparse.Namespace args:
        Parsed command line options.
    """
    if len(sys.argv) == 2:  # no arguments parsed
        download_highest_resolution_progressive(
            youtube=youtube, resolution="highest", target=args.target
        )

    out_dir = args.target
    # (option value, action) pairs; the order below is the execution order.
    requested_actions = (
        (
            args.list_captions,
            lambda: _print_available_captions(youtube.captions),
        ),
        (args.list, lambda: display_streams(youtube)),
        (args.build_playback_report, lambda: build_playback_report(youtube)),
        (
            args.itag,
            lambda: download_by_itag(
                youtube=youtube, itag=args.itag, target=out_dir
            ),
        ),
        (
            args.caption_code,
            lambda: download_caption(
                youtube=youtube, lang_code=args.caption_code, target=out_dir
            ),
        ),
        (
            args.resolution,
            lambda: download_by_resolution(
                youtube=youtube, resolution=args.resolution, target=out_dir
            ),
        ),
        (
            args.audio,
            lambda: download_audio(
                youtube=youtube, filetype=args.audio, target=out_dir
            ),
        ),
        (
            args.ffmpeg,
            lambda: ffmpeg_process(
                youtube=youtube, resolution=args.ffmpeg, target=out_dir
            ),
        ),
    )
    for requested, run_action in requested_actions:
        if requested:
            run_action()
87
+
88
+
89
def _parse_args(
    parser: argparse.ArgumentParser, args: Optional[List] = None
) -> argparse.Namespace:
    """Register the CLI options on ``parser`` and parse the arguments.

    :param argparse.ArgumentParser parser:
        Parser that receives the option definitions.
    :param list args:
        (optional) Argument strings to parse; when ``None`` argparse falls
        back to the process command line.
    :rtype: argparse.Namespace
    :returns:
        The parsed options.
    """
    parser.add_argument(
        "url", help="The YouTube /watch or /playlist url", nargs="?"
    )
    parser.add_argument(
        "--version", action="version", version="%(prog)s " + __version__,
    )
    parser.add_argument(
        "--itag", type=int, help="The itag for the desired stream",
    )
    parser.add_argument(
        "-r",
        "--resolution",
        type=str,
        help="The resolution for the desired stream",
    )
    parser.add_argument(
        "-l",
        "--list",
        action="store_true",
        help=(
            "The list option causes pytube cli to return a list of streams "
            "available to download"
        ),
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        dest="verbose",
        help="Set logger output to verbose output.",
    )
    parser.add_argument(
        "--logfile",
        action="store",
        help="logging debug and error messages into a log file",
    )
    parser.add_argument(
        "--build-playback-report",
        action="store_true",
        help="Save the html and js to disk",
    )
    parser.add_argument(
        "-c",
        "--caption-code",
        type=str,
        # The option always takes a value, so the help must not promise a
        # no-argument mode; unknown codes trigger the listing instead.
        help=(
            "Download srt captions for given language code. "
            "Prints available language codes if the given code is not "
            "available"
        ),
    )
    parser.add_argument(
        "-lc",
        "--list-captions",
        action="store_true",
        help=(
            "List available caption codes for a video"
        )
    )
    parser.add_argument(
        "-t",
        "--target",
        help=(
            "The output directory for the downloaded stream. "
            "Default is current working directory"
        ),
    )
    # Adjacent string literals are concatenated verbatim, so every sentence
    # but the last must end with ". " to keep the rendered help readable.
    parser.add_argument(
        "-a",
        "--audio",
        const="mp4",
        nargs="?",
        help=(
            "Download the audio for a given URL at the highest bitrate "
            "available. "
            "Defaults to mp4 format if none is specified"
        ),
    )
    parser.add_argument(
        "-f",
        "--ffmpeg",
        const="best",
        nargs="?",
        help=(
            "Downloads the audio and video stream for resolution provided. "
            "If no resolution is provided, downloads the best resolution. "
            "Runs the command line program ffmpeg to combine the audio and "
            "video"
        ),
    )

    return parser.parse_args(args)
181
+
182
+
183
def build_playback_report(youtube: YouTube) -> None:
    """Serialize the request data to json for offline debugging.

    Writes a gzip-compressed JSON document named
    ``yt-video-<video_id>-<unix timestamp>.json.gz`` into the current working
    directory. The document holds the watch url, the player js, the watch
    html and the video info of ``youtube``.

    :param YouTube youtube:
        A YouTube object.
    """
    # Use an aware "now": ``timestamp()`` on the naive value returned by
    # ``utcnow()`` is interpreted as local time, which skews the result by
    # the machine's UTC offset.
    ts = int(dt.datetime.now(dt.timezone.utc).timestamp())
    fp = os.path.join(os.getcwd(), f"yt-video-{youtube.video_id}-{ts}.json.gz")

    js = youtube.js
    watch_html = youtube.watch_html
    vid_info = youtube.vid_info

    # Assemble the whole payload before touching the filesystem so a failure
    # while reading from ``youtube`` does not leave an empty archive behind.
    payload = json.dumps(
        {
            "url": youtube.watch_url,
            "js": js,
            "watch_html": watch_html,
            "video_info": vid_info,
        }
    ).encode("utf8")

    with gzip.open(fp, "wb") as fh:
        fh.write(payload)
207
+
208
+
209
def display_progress_bar(
    bytes_received: int, filesize: int, ch: str = "█", scale: float = 0.55
) -> None:
    """Render a one-line progress bar on standard output.

    Example:
    ~~~~~~~~
    PSY - GANGNAM STYLE(강남스타일) MV.mp4
    ↳ |███████████████████████████████████████| 100.0%

    The line ends with a carriage return instead of a newline, so the next
    call overwrites it in place.

    :param int bytes_received:
        Number of bytes of the file received so far.
    :param int filesize:
        File size of the media stream in bytes.
    :param str ch:
        Character to use for presenting progress segment.
    :param float scale:
        Scale multiplier to reduce progress bar size.

    """
    total = float(filesize)
    # The bar occupies a fraction (``scale``) of the terminal width.
    bar_width = int(shutil.get_terminal_size().columns * scale)

    done_cells = int(round(bar_width * bytes_received / total))
    bar = ch * done_cells + " " * (bar_width - done_cells)
    percent = round(100.0 * bytes_received / total, 1)

    sys.stdout.write(f" ↳ |{bar}| {percent}%\r")
    sys.stdout.flush()
240
+
241
+
242
+ # noinspection PyUnusedLocal
243
def on_progress(
    stream: Stream, chunk: bytes, bytes_remaining: int
) -> None:  # pylint: disable=W0613
    """Progress callback that redraws the progress bar.

    :param Stream stream:
        The stream being downloaded; its ``filesize`` is the bar's total.
    :param bytes chunk:
        The data chunk just received (unused).
    :param int bytes_remaining:
        Number of bytes still to be downloaded.
    """
    total = stream.filesize
    display_progress_bar(total - bytes_remaining, total)
249
+
250
+
251
def _download(
    stream: Stream,
    target: Optional[str] = None,
    filename: Optional[str] = None,
) -> None:
    """Download ``stream`` unless the file is already present.

    Prints the file name and its size in whole megabytes first, then either
    reports the existing file or performs the download.

    :param Stream stream:
        The stream to download.
    :param str target:
        (optional) Target directory for download.
    :param str filename:
        (optional) File name to use instead of the stream's default one.
    """
    size_mb = stream.filesize // 1048576
    print(f"{filename or stream.default_filename} | {size_mb} MB")

    destination = stream.get_file_path(filename=filename, output_path=target)
    if not stream.exists_at_path(destination):
        stream.download(output_path=target, filename=filename)
        # Move past the progress bar line, which ends in a carriage return.
        sys.stdout.write("\n")
    else:
        print(f"Already downloaded at:\n{destination}")
265
+
266
+
267
def _unique_name(base: str, subtype: str, media_type: str, target: str) -> str:
    """Build a file name (without extension) not yet used in ``target``.

    Names have the form ``<base>_<media_type>_<n>``; ``n`` starts at 0 and is
    incremented until ``<name>.<subtype>`` does not exist in ``target``.

    :param str base:
        The given base-name.
    :param str subtype:
        The filetype of the video which will be downloaded.
    :param str media_type:
        The media_type of the file, ie. "audio" or "video"
    :param str target:
        Target directory for download.
    :rtype: str
    :returns:
        The first free name, extension not included.
    """
    suffix = 0
    candidate = f"{base}_{media_type}_{suffix}"
    while os.path.exists(os.path.join(target, f"{candidate}.{subtype}")):
        suffix += 1
        candidate = f"{base}_{media_type}_{suffix}"
    return candidate
287
+
288
+
289
def ffmpeg_process(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """
    Decides the correct video stream to download, then calls _ffmpeg_downloader.

    For ``resolution == "best"`` the highest resolution non-progressive
    stream is used, preferring the mp4 variant when it offers the same
    resolution. Otherwise an mp4 stream of the requested resolution is
    preferred, falling back to any subtype. The process exits when no
    matching video stream or no audio stream can be found.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        YouTube video resolution.
    :param str target:
        Target directory for download
    """
    youtube.register_on_progress_callback(on_progress)
    target = target or os.getcwd()

    if resolution == "best":
        highest_quality_stream = (
            youtube.streams.filter(progressive=False)
            .order_by("resolution")
            .last()
        )
        mp4_stream = (
            youtube.streams.filter(progressive=False, subtype="mp4")
            .order_by("resolution")
            .last()
        )
        # Either query can come back empty (None); only compare resolutions
        # when both streams exist so the "not found" path below is reached
        # instead of an AttributeError.
        if (
            highest_quality_stream is not None
            and mp4_stream is not None
            and highest_quality_stream.resolution == mp4_stream.resolution
        ):
            video_stream = mp4_stream
        else:
            video_stream = highest_quality_stream
    else:
        video_stream = youtube.streams.filter(
            progressive=False, resolution=resolution, subtype="mp4"
        ).first()
        if not video_stream:
            video_stream = youtube.streams.filter(
                progressive=False, resolution=resolution
            ).first()
    if video_stream is None:
        print(f"Could not find a stream with resolution: {resolution}")
        print("Try one of these:")
        display_streams(youtube)
        sys.exit()

    # Prefer audio in the same container as the video; otherwise take the
    # audio-only stream that sorts last by "abr".
    audio_stream = youtube.streams.get_audio_only(video_stream.subtype)
    if not audio_stream:
        audio_stream = (
            youtube.streams.filter(only_audio=True).order_by("abr").last()
        )
    if not audio_stream:
        print("Could not find an audio only stream")
        sys.exit()
    _ffmpeg_downloader(
        audio_stream=audio_stream, video_stream=video_stream, target=target
    )
345
+
346
+
347
def _ffmpeg_downloader(
    audio_stream: Stream, video_stream: Stream, target: str
) -> None:
    """
    Downloads the given video and audio streams under unique names, then uses
    ffmpeg to create a new file combining them. The separately downloaded
    files are deleted once ffmpeg has succeeded; they are kept if it fails.

    :param Stream audio_stream:
        A valid Stream object representing the audio to download
    :param Stream video_stream:
        A valid Stream object representing the video to download
    :param str target:
        Directory that receives the downloads and the combined file
    """
    title = safe_filename(video_stream.title)
    video_unique_name = _unique_name(
        title,
        video_stream.subtype,
        "video",
        target=target,
    )
    audio_unique_name = _unique_name(
        title,
        audio_stream.subtype,
        "audio",
        target=target,
    )
    _download(stream=video_stream, target=target, filename=video_unique_name)
    print("Loading audio...")
    _download(stream=audio_stream, target=target, filename=audio_unique_name)

    video_path = os.path.join(
        target, f"{video_unique_name}.{video_stream.subtype}"
    )
    audio_path = os.path.join(
        target, f"{audio_unique_name}.{audio_stream.subtype}"
    )
    final_path = os.path.join(target, f"{title}.{video_stream.subtype}")

    completed = subprocess.run(  # nosec
        [
            "ffmpeg",
            "-i",
            video_path,
            "-i",
            audio_path,
            "-codec",
            "copy",
            final_path,
        ]
    )
    # Only discard the inputs when the merge worked; otherwise the downloads
    # would be lost together with the failed output.
    if completed.returncode != 0:
        print(
            f"ffmpeg exited with code {completed.returncode}; "
            "keeping the separate video and audio files"
        )
        return
    os.unlink(video_path)
    os.unlink(audio_path)
403
+
404
+
405
def download_by_itag(
    youtube: YouTube, itag: int, target: Optional[str] = None
) -> None:
    """Download the stream identified by ``itag``.

    When no such stream exists, the available streams are listed and the
    process exits.

    :param YouTube youtube:
        A valid YouTube object.
    :param int itag:
        YouTube format identifier code.
    :param str target:
        Target directory for download
    """
    chosen = youtube.streams.get_by_itag(itag)
    if chosen is None:
        print(
            f"Could not find a stream with itag: {itag}",
            "Try one of these:",
            sep="\n",
        )
        display_streams(youtube)
        sys.exit()

    youtube.register_on_progress_callback(on_progress)

    try:
        _download(chosen, target=target)
    except KeyboardInterrupt:
        # Ctrl-C during the download ends the program without a traceback.
        sys.exit()
430
+
431
+
432
def download_by_resolution(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """Download the stream matching ``resolution``.

    When no such stream exists, the available streams are listed and the
    process exits.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        YouTube video resolution.
    :param str target:
        Target directory for download
    """
    # TODO(nficano): allow dash itags to be selected
    chosen = youtube.streams.get_by_resolution(resolution)
    if chosen is None:
        print(
            f"Could not find a stream with resolution: {resolution}",
            "Try one of these:",
            sep="\n",
        )
        display_streams(youtube)
        sys.exit()

    youtube.register_on_progress_callback(on_progress)

    try:
        _download(chosen, target=target)
    except KeyboardInterrupt:
        # Ctrl-C during the download ends the program without a traceback.
        sys.exit()
458
+
459
+
460
def download_highest_resolution_progressive(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """Download the stream returned by ``streams.get_highest_resolution()``.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        YouTube video resolution (not used by this function).
    :param str target:
        Target directory for download
    """
    youtube.register_on_progress_callback(on_progress)

    try:
        best = youtube.streams.get_highest_resolution()
    except exceptions.VideoUnavailable as err:
        print(f"No video streams available: {err}")
        return

    try:
        _download(best, target=target)
    except KeyboardInterrupt:
        # Ctrl-C during the download ends the program without a traceback.
        sys.exit()
482
+
483
+
484
def display_streams(youtube: YouTube) -> None:
    """Print every stream available for the video, one per line.

    :param YouTube youtube:
        A valid YouTube object.

    """
    available = youtube.streams
    for fmt in available:
        print(fmt)
493
+
494
+
495
def _print_available_captions(captions: CaptionQuery) -> None:
    """Print the codes of all given captions as a comma separated list.

    :param CaptionQuery captions:
        The captions whose ``code`` values are listed.
    """
    codes = ", ".join(caption.code for caption in captions)
    print(f"Available caption codes are: {codes}")
499
+
500
+
501
def download_caption(
    youtube: YouTube, lang_code: Optional[str], target: Optional[str] = None
) -> None:
    """Save the caption track for ``lang_code`` to disk.

    :param YouTube youtube:
        A valid YouTube object.
    :param str lang_code:
        Language code desired for caption file.
        The available codes are printed instead when the lookup raises
        ``KeyError`` (e.g. the code is unknown).
    :param str target:
        Target directory for download
    """
    try:
        saved_to = youtube.captions[lang_code].download(
            title=youtube.title, output_path=target
        )
        print(f"Saved caption file to: {saved_to}")
    except KeyError:
        print(f"Unable to find caption with code: {lang_code}")
        _print_available_captions(youtube.captions)
524
+
525
+
526
def download_audio(
    youtube: YouTube, filetype: str, target: Optional[str] = None
) -> None:
    """
    Download the audio-only stream of the given file type that sorts last
    when ordered by "abr".

    When no audio-only stream of that type exists, the available streams are
    listed and the process exits.

    :param YouTube youtube:
        A valid YouTube object.
    :param str filetype:
        Desired file format to download.
    :param str target:
        Target directory for download
    """
    candidates = youtube.streams.filter(only_audio=True, subtype=filetype)
    best_audio = candidates.order_by("abr").last()

    if best_audio is None:
        print("No audio only stream found. Try one of these:")
        display_streams(youtube)
        sys.exit()

    youtube.register_on_progress_callback(on_progress)

    try:
        _download(best_audio, target=target)
    except KeyboardInterrupt:
        # Ctrl-C during the download ends the program without a traceback.
        sys.exit()
557
+
558
+
559
+ if __name__ == "__main__":
560
+ main()
pytube/contrib/__init__.py ADDED
File without changes
pytube/contrib/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (150 Bytes). View file
 
pytube/contrib/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (150 Bytes). View file