diff --git a/app.py b/app.py
index bf46e6a36653108c1bfbaaf483eb8bd14b952885..0c09042a7935d06bd1b2081b68f24668f252f25f 100644
--- a/app.py
+++ b/app.py
@@ -1,55 +1,51 @@
-# main.py
-
-from fastapi import FastAPI
+import gradio as gr
from PIL import Image
-import base64
-from fastapi.responses import HTMLResponse, FileResponse
-
-app = FastAPI()
-
-
-@app.get("/")
-async def root():
- return FileResponse(path="static/index.html", media_type="text/html")
-
-@app.get("/html")
-async def root():
- """Basic HTML response."""
- body = (
- ""
- "
"
- "Welcome to the API
"
- ""
- "Check the docs:
here"
- "
"
- ""
- ""
- )
-
- return HTMLResponse(content=body)
-
-@app.get("/api")
-async def cal_api():
+import os
+import summarizer as su
+import nltk
+
+
+def image_mod(rpunkt_switch, link):
+
+ if len(link)==0:
+ return 'Error: No link provided', None
+
+ nltk_file = 'nltk_data/tokenizers/punkt.zip'
+ home_pc = '/Users/hujo/'
+ home_hf = '/home/user/'
+ if os.path.exists(home_pc+nltk_file) or os.path.exists(home_hf+nltk_file):
+ print('nltk punkt file exists in ', nltk_file)
+ else:
+ nltk.download('punkt')
+
+ #link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE'
+ lexrank_switch = True
+ html = ''
images = []
+ html, images = su.getSummary(link, lexrank_switch, rpunkt_switch)
+ #images = su.getSummaryImage(link, lexrank_switch, rpunkt_switch)
+ print(html)
+
+ files = os.listdir('workdir/')
+ print('local files: ',files)
- with open('workdir/lion.jpg', 'rb') as open_file:
- byte_content = open_file.read()
- base64_bytes = base64.b64encode(byte_content)
- base64_string = base64_bytes.decode('utf-8')
- images.append(base64_string)
+ #image_path = 'workdir/lion.jpg'
+ #im = Image.open(image_path)
+ #images.append(im)
+ #with Image.open(open(image_path,'rb')) as im:
+ # images.append(im)
+ #images.append(im.rotate(90))
+
+ #images[0].save("newlion.png")
+
+ print('images',images)
- with open('workdir/cheetah.jpg', 'rb') as open_file:
- byte_content = open_file.read()
- base64_bytes = base64.b64encode(byte_content)
- base64_string = base64_bytes.decode('utf-8')
- images.append(base64_string)
+ return html, images
- #image_path='lion.jpg'
- #pilim = Image.open(image_path)
- #pilimrot = pilim.rotate(45)
- return {"data": images}
-@app.get("/items/{item_id}")
-async def read_item(item_id):
- return {"item_id": item_id}
+demo = gr.Interface(image_mod,
+    [gr.Checkbox(label='Restore punctuation'), "text"] , ["html", gr.Gallery()],
+ allow_flagging="never")
+if __name__ == "__main__":
+ demo.launch()
diff --git a/frames.py b/frames.py
new file mode 100644
index 0000000000000000000000000000000000000000..043585fdb6a24ebb279d826f36adab02cf402a45
--- /dev/null
+++ b/frames.py
@@ -0,0 +1,102 @@
+from ast import Try
+import subprocess as sp
+import os
+
+# show current venv: echo $VIRTUAL_ENV
+# import sys
+# del sys.modules['frames']
+
+# transcript module
+# 1. extract timestamps from transcript
+# 2. extract captions from transcript
+# this module
+# 3. extract frames at timestamps
+# 4. add caption to each frame
+# 5. convert images to mp4 video
+
+# converts a list of images to a mp4 video
+def convertImageToVideo():
+ cmd = "ffmpeg -y -f image2 -i frame_%04d.jpg output_video.mp4"
+ cmd_call = cmd.split()
+ working_dir = './workdir'
+
+ with sp.Popen(cmd_call,cwd=working_dir, stderr=sp.PIPE) as proc:
+ result = proc.stderr.read()
+
+ return [proc.wait(),result]
+
+
+# extract a frame as jpg image file
+# from a video at a given timestamp
+# num=0; for p in $(cat timestamps); do ((num++)); printf "$num $p\r"; dnum=$(printf "%03d" "$num"); ffmpeg -ss $p -i "$mp4file" -frames:v 1 out_$dnum.jpg >& ffmpeg.out; done
+def extractImagesFromVideo(timestamps):
+ working_dir = './workdir'
+ input_file = 'input_video.mp4'
+ if not os.path.isfile(working_dir+'/'+input_file):
+ return 'Error: File '+input_file+' is missing, create the file first.'
+
+
+ # create a working directory for the files
+ if not os.path.isdir(working_dir):
+ print('There is no working directory. Create a new one.')
+ os.mkdir(working_dir)
+
+ proc_list = []
+ for current_frame, current_timestamp in enumerate(timestamps, start=1):
+ print(f"{current_frame:04d}", current_timestamp)
+ cmd = 'ffmpeg -y -ss '+str(current_timestamp)+' -i '+input_file+' -frames:v 1 frame_'+f"{current_frame:04d}"+'.jpg'
+ cmd_call = cmd.split()
+
+ with sp.Popen(cmd_call,cwd=working_dir, stderr=sp.PIPE) as proc:
+ proc_list.append(proc.wait())
+
+ return proc_list
+
+# add caption to each image
+# 'convert' program is from the 'imagemagick' package
+# num=0; while read p; do ((num++)); dnum=$(printf "%03d" "$num"); printf "$dnum $p\r"; convert out_$dnum.jpg -undercolor Black -fill white -gravity South -pointsize 25 -annotate +0+10 "$p" out_$dnum.jpg >& ffmpeg.out; done 10:
+ lang = detect(text)
+ if lang != 'en':
+ raise Exception(F"""Non English text detected. Restore Punctuation works only for English.
+ If you are certain the input is English, pass argument lang='en' to this function.
+ Punctuate received: {text}""")
+
+    # split up large text into BERT-digestible chunks
+ splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
+ # predict slices
+ # full_preds_lst contains tuple of labels and logits
+ full_preds_lst = [self.predict(i['text']) for i in splits]
+ # extract predictions, and discard logits
+ preds_lst = [i[0][0] for i in full_preds_lst]
+ # join text slices
+ combined_preds = self.combine_results(text, preds_lst)
+ # create punctuated prediction
+ punct_text = self.punctuate_texts(combined_preds)
+ return punct_text
+
+ def predict(self, input_slice):
+ """
+ Passes the unpunctuated text to the model for punctuation.
+ """
+ predictions, raw_outputs = self.model.predict([input_slice])
+ return predictions, raw_outputs
+
+ @staticmethod
+ def split_on_toks(text, length, overlap):
+ """
+ Splits text into predefined slices of overlapping text with indexes (offsets)
+ that tie-back to original text.
+ This is done to bypass 512 token limit on transformer models by sequentially
+ feeding chunks of < 512 toks.
+ Example output:
+ [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
+ """
+ wrds = text.replace('\n', ' ').split(" ")
+ resp = []
+ lst_chunk_idx = 0
+ i = 0
+
+ while True:
+ # words in the chunk and the overlapping portion
+ wrds_len = wrds[(length * i):(length * (i + 1))]
+ wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
+ wrds_split = wrds_len + wrds_ovlp
+
+ # Break loop if no more words
+ if not wrds_split:
+ break
+
+ wrds_str = " ".join(wrds_split)
+ nxt_chunk_start_idx = len(" ".join(wrds_len))
+ lst_char_idx = len(" ".join(wrds_split))
+
+ resp_obj = {
+ "text": wrds_str,
+ "start_idx": lst_chunk_idx,
+ "end_idx": lst_char_idx + lst_chunk_idx,
+ }
+
+ resp.append(resp_obj)
+ lst_chunk_idx += nxt_chunk_start_idx + 1
+ i += 1
+ logging.info(f"Sliced transcript into {len(resp)} slices.")
+ return resp
+
+ @staticmethod
+ def combine_results(full_text: str, text_slices):
+ """
+ Given a full text and predictions of each slice combines predictions into a single text again.
+    Performs validation whether text was combined correctly
+ """
+ split_full_text = full_text.replace('\n', ' ').split(" ")
+ split_full_text = [i for i in split_full_text if i]
+ split_full_text_len = len(split_full_text)
+ output_text = []
+ index = 0
+
+ if len(text_slices[-1]) <= 3 and len(text_slices) > 1:
+ text_slices = text_slices[:-1]
+
+ for _slice in text_slices:
+ slice_wrds = len(_slice)
+ for ix, wrd in enumerate(_slice):
+ # print(index, "|", str(list(wrd.keys())[0]), "|", split_full_text[index])
+ if index == split_full_text_len:
+ break
+
+ if split_full_text[index] == str(list(wrd.keys())[0]) and \
+ ix <= slice_wrds - 3 and text_slices[-1] != _slice:
+ index += 1
+ pred_item_tuple = list(wrd.items())[0]
+ output_text.append(pred_item_tuple)
+ elif split_full_text[index] == str(list(wrd.keys())[0]) and text_slices[-1] == _slice:
+ index += 1
+ pred_item_tuple = list(wrd.items())[0]
+ output_text.append(pred_item_tuple)
+ assert [i[0] for i in output_text] == split_full_text
+ return output_text
+
+ @staticmethod
+ def punctuate_texts(full_pred: list):
+ """
+ Given a list of Predictions from the model, applies the predictions to text,
+ thus punctuating it.
+ """
+ punct_resp = ""
+ for i in full_pred:
+ word, label = i
+ if label[-1] == "U":
+ punct_wrd = word.capitalize()
+ else:
+ punct_wrd = word
+
+ if label[0] != "O":
+ punct_wrd += label[0]
+
+ punct_resp += punct_wrd + " "
+ punct_resp = punct_resp.strip()
+    # Append trailing period if it doesn't exist.
+ if punct_resp[-1].isalnum():
+ punct_resp += "."
+ return punct_resp
+
+
+if __name__ == "__main__":
+ punct_model = RestorePuncts()
+ # read test file
+ with open('../tests/sample_text.txt', 'r') as fp:
+ test_sample = fp.read()
+ # predict text and print
+ punctuated = punct_model.punctuate(test_sample)
+ print(punctuated)
diff --git a/myrpunct/utils.py b/myrpunct/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..77e88f9bfbded47ca0929abf5dc5686e49d674ea
--- /dev/null
+++ b/myrpunct/utils.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+# 💾⚙️🔮
+
+__author__ = "Daulet N."
+__email__ = "daulet.nurmanbetov@gmail.com"
+
+def prepare_unpunct_text(text):
+ """
+ Given a text, normalizes it to subsequently restore punctuation
+ """
+ formatted_txt = text.replace('\n', '').strip()
+ formatted_txt = formatted_txt.lower()
+ formatted_txt_lst = formatted_txt.split(" ")
+ punct_strp_txt = [strip_punct(i) for i in formatted_txt_lst]
+ normalized_txt = " ".join([i for i in punct_strp_txt if i])
+ return normalized_txt
+
+def strip_punct(wrd):
+ """
+    Given a word, strips non-alphanumeric characters that precede and follow it
+ """
+ if not wrd:
+ return wrd
+
+ while not wrd[-1:].isalnum():
+ if not wrd:
+ break
+ wrd = wrd[:-1]
+
+ while not wrd[:1].isalnum():
+ if not wrd:
+ break
+ wrd = wrd[1:]
+ return wrd
diff --git a/pytube/__init__.py b/pytube/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4eaa1b2136cdeca46724e46d542a764707c41532
--- /dev/null
+++ b/pytube/__init__.py
@@ -0,0 +1,19 @@
+# flake8: noqa: F401
+# noreorder
+"""
+Pytube: a very serious Python library for downloading YouTube Videos.
+"""
+__title__ = "pytube"
+__author__ = "Ronnie Ghose, Taylor Fox Dahlin, Nick Ficano"
+__license__ = "The Unlicense (Unlicense)"
+__js__ = None
+__js_url__ = None
+
+from pytube.version import __version__
+from pytube.streams import Stream
+from pytube.captions import Caption
+from pytube.query import CaptionQuery, StreamQuery
+from pytube.__main__ import YouTube
+from pytube.contrib.playlist import Playlist
+from pytube.contrib.channel import Channel
+from pytube.contrib.search import Search
diff --git a/pytube/__main__.py b/pytube/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..30e98e29104b37dc7ab37a03590641ea7ca378ff
--- /dev/null
+++ b/pytube/__main__.py
@@ -0,0 +1,467 @@
+"""
+This module implements the core developer interface for pytube.
+
+The problem domain of the :class:`YouTube class focuses almost
+exclusively on the developer interface. Pytube offloads the heavy lifting to
+smaller peripheral modules and functions.
+
+"""
+import logging
+from typing import Any, Callable, Dict, List, Optional
+
+import pytube
+import pytube.exceptions as exceptions
+from pytube import extract, request
+from pytube import Stream, StreamQuery
+from pytube.helpers import install_proxy
+from pytube.innertube import InnerTube
+from pytube.metadata import YouTubeMetadata
+from pytube.monostate import Monostate
+
+logger = logging.getLogger(__name__)
+
+
+class YouTube:
+ """Core developer interface for pytube."""
+
+ def __init__(
+ self,
+ url: str,
+ on_progress_callback: Optional[Callable[[Any, bytes, int], None]] = None,
+ on_complete_callback: Optional[Callable[[Any, Optional[str]], None]] = None,
+ proxies: Dict[str, str] = None,
+ use_oauth: bool = False,
+ allow_oauth_cache: bool = True
+ ):
+ """Construct a :class:`YouTube `.
+
+ :param str url:
+ A valid YouTube watch URL.
+ :param func on_progress_callback:
+ (Optional) User defined callback function for stream download
+ progress events.
+ :param func on_complete_callback:
+ (Optional) User defined callback function for stream download
+ complete events.
+ :param dict proxies:
+ (Optional) A dict mapping protocol to proxy address which will be used by pytube.
+ :param bool use_oauth:
+ (Optional) Prompt the user to authenticate to YouTube.
+ If allow_oauth_cache is set to True, the user should only be prompted once.
+ :param bool allow_oauth_cache:
+ (Optional) Cache OAuth tokens locally on the machine. Defaults to True.
+ These tokens are only generated if use_oauth is set to True as well.
+ """
+ self._js: Optional[str] = None # js fetched by js_url
+ self._js_url: Optional[str] = None # the url to the js, parsed from watch html
+
+ self._vid_info: Optional[Dict] = None # content fetched from innertube/player
+
+ self._watch_html: Optional[str] = None # the html of /watch?v=
+ self._embed_html: Optional[str] = None
+ self._player_config_args: Optional[Dict] = None # inline js in the html containing
+ self._age_restricted: Optional[bool] = None
+
+ self._fmt_streams: Optional[List[Stream]] = None
+
+ self._initial_data = None
+ self._metadata: Optional[YouTubeMetadata] = None
+
+ # video_id part of /watch?v=
+ self.video_id = extract.video_id(url)
+
+ self.watch_url = f"https://youtube.com/watch?v={self.video_id}"
+ self.embed_url = f"https://www.youtube.com/embed/{self.video_id}"
+
+ # Shared between all instances of `Stream` (Borg pattern).
+ self.stream_monostate = Monostate(
+ on_progress=on_progress_callback, on_complete=on_complete_callback
+ )
+
+ if proxies:
+ install_proxy(proxies)
+
+ self._author = None
+ self._title = None
+ self._publish_date = None
+
+ self.use_oauth = use_oauth
+ self.allow_oauth_cache = allow_oauth_cache
+
+ def __repr__(self):
+ return f''
+
+ def __eq__(self, o: object) -> bool:
+ # Compare types and urls, if they're same return true, else return false.
+ return type(o) == type(self) and o.watch_url == self.watch_url
+
+ @property
+ def watch_html(self):
+ if self._watch_html:
+ return self._watch_html
+ self._watch_html = request.get(url=self.watch_url)
+ return self._watch_html
+
+ @property
+ def embed_html(self):
+ if self._embed_html:
+ return self._embed_html
+ self._embed_html = request.get(url=self.embed_url)
+ return self._embed_html
+
+ @property
+ def age_restricted(self):
+ if self._age_restricted:
+ return self._age_restricted
+ self._age_restricted = extract.is_age_restricted(self.watch_html)
+ return self._age_restricted
+
+ @property
+ def js_url(self):
+ if self._js_url:
+ return self._js_url
+
+ if self.age_restricted:
+ self._js_url = extract.js_url(self.embed_html)
+ else:
+ self._js_url = extract.js_url(self.watch_html)
+
+ return self._js_url
+
+ @property
+ def js(self):
+ if self._js:
+ return self._js
+
+ # If the js_url doesn't match the cached url, fetch the new js and update
+ # the cache; otherwise, load the cache.
+ if pytube.__js_url__ != self.js_url:
+ self._js = request.get(self.js_url)
+ pytube.__js__ = self._js
+ pytube.__js_url__ = self.js_url
+ else:
+ self._js = pytube.__js__
+
+ return self._js
+
+ @property
+ def initial_data(self):
+ if self._initial_data:
+ return self._initial_data
+ self._initial_data = extract.initial_data(self.watch_html)
+ return self._initial_data
+
+ @property
+ def streaming_data(self):
+ """Return streamingData from video info."""
+ if 'streamingData' in self.vid_info:
+ return self.vid_info['streamingData']
+ else:
+ self.bypass_age_gate()
+ return self.vid_info['streamingData']
+
+ @property
+ def fmt_streams(self):
+ """Returns a list of streams if they have been initialized.
+
+ If the streams have not been initialized, finds all relevant
+ streams and initializes them.
+ """
+ self.check_availability()
+ if self._fmt_streams:
+ return self._fmt_streams
+
+ self._fmt_streams = []
+
+ stream_manifest = extract.apply_descrambler(self.streaming_data)
+
+ # If the cached js doesn't work, try fetching a new js file
+ # https://github.com/pytube/pytube/issues/1054
+ try:
+ extract.apply_signature(stream_manifest, self.vid_info, self.js)
+ except exceptions.ExtractError:
+ # To force an update to the js file, we clear the cache and retry
+ self._js = None
+ self._js_url = None
+ pytube.__js__ = None
+ pytube.__js_url__ = None
+ extract.apply_signature(stream_manifest, self.vid_info, self.js)
+
+ # build instances of :class:`Stream `
+ # Initialize stream objects
+ for stream in stream_manifest:
+ video = Stream(
+ stream=stream,
+ monostate=self.stream_monostate,
+ )
+ self._fmt_streams.append(video)
+
+ self.stream_monostate.title = self.title
+ self.stream_monostate.duration = self.length
+
+ return self._fmt_streams
+
+ def check_availability(self):
+ """Check whether the video is available.
+
+ Raises different exceptions based on why the video is unavailable,
+ otherwise does nothing.
+ """
+ status, messages = extract.playability_status(self.watch_html)
+
+ for reason in messages:
+ if status == 'UNPLAYABLE':
+ if reason == (
+ 'Join this channel to get access to members-only content '
+ 'like this video, and other exclusive perks.'
+ ):
+ raise exceptions.MembersOnly(video_id=self.video_id)
+ elif reason == 'This live stream recording is not available.':
+ raise exceptions.RecordingUnavailable(video_id=self.video_id)
+ else:
+ raise exceptions.VideoUnavailable(video_id=self.video_id)
+ elif status == 'LOGIN_REQUIRED':
+ if reason == (
+ 'This is a private video. '
+ 'Please sign in to verify that you may see it.'
+ ):
+ raise exceptions.VideoPrivate(video_id=self.video_id)
+ elif status == 'ERROR':
+ if reason == 'Video unavailable':
+ raise exceptions.VideoUnavailable(video_id=self.video_id)
+ elif status == 'LIVE_STREAM':
+ raise exceptions.LiveStreamError(video_id=self.video_id)
+
+ @property
+ def vid_info(self):
+ """Parse the raw vid info and return the parsed result.
+
+ :rtype: Dict[Any, Any]
+ """
+ if self._vid_info:
+ return self._vid_info
+
+ innertube = InnerTube(use_oauth=self.use_oauth, allow_cache=self.allow_oauth_cache)
+
+ innertube_response = innertube.player(self.video_id)
+ self._vid_info = innertube_response
+ return self._vid_info
+
+ def bypass_age_gate(self):
+ """Attempt to update the vid_info by bypassing the age gate."""
+ innertube = InnerTube(
+ client='ANDROID_EMBED',
+ use_oauth=self.use_oauth,
+ allow_cache=self.allow_oauth_cache
+ )
+ innertube_response = innertube.player(self.video_id)
+
+ playability_status = innertube_response['playabilityStatus'].get('status', None)
+
+ # If we still can't access the video, raise an exception
+ # (tier 3 age restriction)
+ if playability_status == 'UNPLAYABLE':
+ raise exceptions.AgeRestrictedError(self.video_id)
+
+ self._vid_info = innertube_response
+
+ @property
+ def caption_tracks(self) -> List[pytube.Caption]:
+ """Get a list of :class:`Caption `.
+
+ :rtype: List[Caption]
+ """
+ raw_tracks = (
+ self.vid_info.get("captions", {})
+ .get("playerCaptionsTracklistRenderer", {})
+ .get("captionTracks", [])
+ )
+ return [pytube.Caption(track) for track in raw_tracks]
+
+ @property
+ def captions(self) -> pytube.CaptionQuery:
+ """Interface to query caption tracks.
+
+ :rtype: :class:`CaptionQuery `.
+ """
+ return pytube.CaptionQuery(self.caption_tracks)
+
+ @property
+ def streams(self) -> StreamQuery:
+ """Interface to query both adaptive (DASH) and progressive streams.
+
+ :rtype: :class:`StreamQuery `.
+ """
+ self.check_availability()
+ return StreamQuery(self.fmt_streams)
+
+ @property
+ def thumbnail_url(self) -> str:
+ """Get the thumbnail url image.
+
+ :rtype: str
+ """
+ thumbnail_details = (
+ self.vid_info.get("videoDetails", {})
+ .get("thumbnail", {})
+ .get("thumbnails")
+ )
+ if thumbnail_details:
+ thumbnail_details = thumbnail_details[-1] # last item has max size
+ return thumbnail_details["url"]
+
+ return f"https://img.youtube.com/vi/{self.video_id}/maxresdefault.jpg"
+
+ @property
+ def publish_date(self):
+ """Get the publish date.
+
+ :rtype: datetime
+ """
+ if self._publish_date:
+ return self._publish_date
+ self._publish_date = extract.publish_date(self.watch_html)
+ return self._publish_date
+
+ @publish_date.setter
+ def publish_date(self, value):
+ """Sets the publish date."""
+ self._publish_date = value
+
+ @property
+ def title(self) -> str:
+ """Get the video title.
+
+ :rtype: str
+ """
+ if self._title:
+ return self._title
+
+ try:
+ self._title = self.vid_info['videoDetails']['title']
+ except KeyError:
+ # Check_availability will raise the correct exception in most cases
+ # if it doesn't, ask for a report.
+ self.check_availability()
+ raise exceptions.PytubeError(
+ (
+ f'Exception while accessing title of {self.watch_url}. '
+ 'Please file a bug report at https://github.com/pytube/pytube'
+ )
+ )
+
+ return self._title
+
+ @title.setter
+ def title(self, value):
+ """Sets the title value."""
+ self._title = value
+
+ @property
+ def description(self) -> str:
+ """Get the video description.
+
+ :rtype: str
+ """
+ return self.vid_info.get("videoDetails", {}).get("shortDescription")
+
+ @property
+ def rating(self) -> float:
+ """Get the video average rating.
+
+ :rtype: float
+
+ """
+ return self.vid_info.get("videoDetails", {}).get("averageRating")
+
+ @property
+ def length(self) -> int:
+ """Get the video length in seconds.
+
+ :rtype: int
+ """
+ return int(self.vid_info.get('videoDetails', {}).get('lengthSeconds'))
+
+ @property
+ def views(self) -> int:
+ """Get the number of the times the video has been viewed.
+
+ :rtype: int
+ """
+ return int(self.vid_info.get("videoDetails", {}).get("viewCount"))
+
+ @property
+ def author(self) -> str:
+ """Get the video author.
+ :rtype: str
+ """
+ if self._author:
+ return self._author
+ self._author = self.vid_info.get("videoDetails", {}).get(
+ "author", "unknown"
+ )
+ return self._author
+
+ @author.setter
+ def author(self, value):
+ """Set the video author."""
+ self._author = value
+
+ @property
+ def keywords(self) -> List[str]:
+ """Get the video keywords.
+
+ :rtype: List[str]
+ """
+ return self.vid_info.get('videoDetails', {}).get('keywords', [])
+
+ @property
+ def channel_id(self) -> str:
+ """Get the video poster's channel id.
+
+ :rtype: str
+ """
+ return self.vid_info.get('videoDetails', {}).get('channelId', None)
+
+ @property
+ def channel_url(self) -> str:
+ """Construct the channel url for the video's poster from the channel id.
+
+ :rtype: str
+ """
+ return f'https://www.youtube.com/channel/{self.channel_id}'
+
+ @property
+ def metadata(self) -> Optional[YouTubeMetadata]:
+ """Get the metadata for the video.
+
+ :rtype: YouTubeMetadata
+ """
+ if self._metadata:
+ return self._metadata
+ else:
+ self._metadata = extract.metadata(self.initial_data)
+ return self._metadata
+
+ def register_on_progress_callback(self, func: Callable[[Any, bytes, int], None]):
+ """Register a download progress callback function post initialization.
+
+ :param callable func:
+ A callback function that takes ``stream``, ``chunk``,
+ and ``bytes_remaining`` as parameters.
+
+ :rtype: None
+
+ """
+ self.stream_monostate.on_progress = func
+
+ def register_on_complete_callback(self, func: Callable[[Any, Optional[str]], None]):
+ """Register a download complete callback function post initialization.
+
+ :param callable func:
+ A callback function that takes ``stream`` and ``file_path``.
+
+ :rtype: None
+
+ """
+ self.stream_monostate.on_complete = func
diff --git a/pytube/__pycache__/__init__.cpython-310.pyc b/pytube/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84df328c97072dca22bd94ba9baf25218d48fa19
Binary files /dev/null and b/pytube/__pycache__/__init__.cpython-310.pyc differ
diff --git a/pytube/__pycache__/__init__.cpython-39.pyc b/pytube/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24ac25f376bc8f968311fa278f3b85eed7fe358f
Binary files /dev/null and b/pytube/__pycache__/__init__.cpython-39.pyc differ
diff --git a/pytube/__pycache__/__main__.cpython-310.pyc b/pytube/__pycache__/__main__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53f42fd7eb921301c84e6ec8bb940c8c7fa3958a
Binary files /dev/null and b/pytube/__pycache__/__main__.cpython-310.pyc differ
diff --git a/pytube/__pycache__/__main__.cpython-39.pyc b/pytube/__pycache__/__main__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02f607e04639c444259a17281ee72e7ae7eec83f
Binary files /dev/null and b/pytube/__pycache__/__main__.cpython-39.pyc differ
diff --git a/pytube/__pycache__/captions.cpython-310.pyc b/pytube/__pycache__/captions.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ddd7f91b1518ccec73d418e192dbc94ec71da310
Binary files /dev/null and b/pytube/__pycache__/captions.cpython-310.pyc differ
diff --git a/pytube/__pycache__/captions.cpython-39.pyc b/pytube/__pycache__/captions.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e8f189f4ff7582ecc55e0e141d53e96b8c51d20
Binary files /dev/null and b/pytube/__pycache__/captions.cpython-39.pyc differ
diff --git a/pytube/__pycache__/cipher.cpython-310.pyc b/pytube/__pycache__/cipher.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6511343c9f3b6d5ac241ef6717fb414060236302
Binary files /dev/null and b/pytube/__pycache__/cipher.cpython-310.pyc differ
diff --git a/pytube/__pycache__/cipher.cpython-39.pyc b/pytube/__pycache__/cipher.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..39943e899c5b55ee337acba1e8100437de47dd8e
Binary files /dev/null and b/pytube/__pycache__/cipher.cpython-39.pyc differ
diff --git a/pytube/__pycache__/exceptions.cpython-310.pyc b/pytube/__pycache__/exceptions.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32a7fa8879bdff838155b800052e283d8e031abb
Binary files /dev/null and b/pytube/__pycache__/exceptions.cpython-310.pyc differ
diff --git a/pytube/__pycache__/exceptions.cpython-39.pyc b/pytube/__pycache__/exceptions.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..445f03922e73c716d27c5cf82aef14d6394aceac
Binary files /dev/null and b/pytube/__pycache__/exceptions.cpython-39.pyc differ
diff --git a/pytube/__pycache__/extract.cpython-310.pyc b/pytube/__pycache__/extract.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eef71d15ea99f7b2ce9bdabd4d3894835bc6ce41
Binary files /dev/null and b/pytube/__pycache__/extract.cpython-310.pyc differ
diff --git a/pytube/__pycache__/extract.cpython-39.pyc b/pytube/__pycache__/extract.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..222522ec15e32abd3e6c0fdcb6df2c76ef7eb08c
Binary files /dev/null and b/pytube/__pycache__/extract.cpython-39.pyc differ
diff --git a/pytube/__pycache__/helpers.cpython-310.pyc b/pytube/__pycache__/helpers.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9feebcb11892130cad8ed7f69fa59484d65cf4e
Binary files /dev/null and b/pytube/__pycache__/helpers.cpython-310.pyc differ
diff --git a/pytube/__pycache__/helpers.cpython-39.pyc b/pytube/__pycache__/helpers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fec63e34d3ce39e5c70766497a5e747a2a71c9d
Binary files /dev/null and b/pytube/__pycache__/helpers.cpython-39.pyc differ
diff --git a/pytube/__pycache__/innertube.cpython-310.pyc b/pytube/__pycache__/innertube.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..826b415b90c19a734bcb3f621779f6eed3cb26ff
Binary files /dev/null and b/pytube/__pycache__/innertube.cpython-310.pyc differ
diff --git a/pytube/__pycache__/innertube.cpython-39.pyc b/pytube/__pycache__/innertube.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3da0d221cd74b1ebbf3593a75d225665580fc87a
Binary files /dev/null and b/pytube/__pycache__/innertube.cpython-39.pyc differ
diff --git a/pytube/__pycache__/itags.cpython-310.pyc b/pytube/__pycache__/itags.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..022a2505df1ebc43c2530118a99a8470c2e67341
Binary files /dev/null and b/pytube/__pycache__/itags.cpython-310.pyc differ
diff --git a/pytube/__pycache__/itags.cpython-39.pyc b/pytube/__pycache__/itags.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..565ab774167f16e3e718fdd215a5beb08158a50c
Binary files /dev/null and b/pytube/__pycache__/itags.cpython-39.pyc differ
diff --git a/pytube/__pycache__/metadata.cpython-310.pyc b/pytube/__pycache__/metadata.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..494ee1cf075e84d51a1b63efc1f0d3a54f3e5e7e
Binary files /dev/null and b/pytube/__pycache__/metadata.cpython-310.pyc differ
diff --git a/pytube/__pycache__/metadata.cpython-39.pyc b/pytube/__pycache__/metadata.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd0b5da02bfb1fa8fec2ffd4f8c703feb77423b1
Binary files /dev/null and b/pytube/__pycache__/metadata.cpython-39.pyc differ
diff --git a/pytube/__pycache__/monostate.cpython-310.pyc b/pytube/__pycache__/monostate.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a6b92092689f4f2151b3318b1f38e62aecc6414
Binary files /dev/null and b/pytube/__pycache__/monostate.cpython-310.pyc differ
diff --git a/pytube/__pycache__/monostate.cpython-39.pyc b/pytube/__pycache__/monostate.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ce8bd127834f4bc62a7212cfd725e7ef0f21d79
Binary files /dev/null and b/pytube/__pycache__/monostate.cpython-39.pyc differ
diff --git a/pytube/__pycache__/parser.cpython-310.pyc b/pytube/__pycache__/parser.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1acfc4137454b77b841b6ce8ea08a12e68b4459e
Binary files /dev/null and b/pytube/__pycache__/parser.cpython-310.pyc differ
diff --git a/pytube/__pycache__/parser.cpython-39.pyc b/pytube/__pycache__/parser.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae93b7ac4c9f49156f75bdff0775cd739b3fd2aa
Binary files /dev/null and b/pytube/__pycache__/parser.cpython-39.pyc differ
diff --git a/pytube/__pycache__/query.cpython-310.pyc b/pytube/__pycache__/query.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9c2752c9d6774f5409951cc70c5dc725b7eba8d
Binary files /dev/null and b/pytube/__pycache__/query.cpython-310.pyc differ
diff --git a/pytube/__pycache__/query.cpython-39.pyc b/pytube/__pycache__/query.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aed2d037c520df540cdabf711befa97e8679b9e7
Binary files /dev/null and b/pytube/__pycache__/query.cpython-39.pyc differ
diff --git a/pytube/__pycache__/request.cpython-310.pyc b/pytube/__pycache__/request.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..797d0f1f8c3687e47460dfb4963e2b2e11612bb3
Binary files /dev/null and b/pytube/__pycache__/request.cpython-310.pyc differ
diff --git a/pytube/__pycache__/request.cpython-39.pyc b/pytube/__pycache__/request.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e7fa35dd40236f71da621d849b1662f09002c1d
Binary files /dev/null and b/pytube/__pycache__/request.cpython-39.pyc differ
diff --git a/pytube/__pycache__/streams.cpython-310.pyc b/pytube/__pycache__/streams.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..702e55b92d65a1c831167db5c856e85fdf833533
Binary files /dev/null and b/pytube/__pycache__/streams.cpython-310.pyc differ
diff --git a/pytube/__pycache__/streams.cpython-39.pyc b/pytube/__pycache__/streams.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5156381cbc356ce1e894313e5d441b83fca5bb4d
Binary files /dev/null and b/pytube/__pycache__/streams.cpython-39.pyc differ
diff --git a/pytube/__pycache__/version.cpython-310.pyc b/pytube/__pycache__/version.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5494b09c76b42d6cdc098bac518b832a0e22bbb
Binary files /dev/null and b/pytube/__pycache__/version.cpython-310.pyc differ
diff --git a/pytube/__pycache__/version.cpython-39.pyc b/pytube/__pycache__/version.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4d9190afb08811e4303014b684c53d65a0cba3a
Binary files /dev/null and b/pytube/__pycache__/version.cpython-39.pyc differ
diff --git a/pytube/captions.py b/pytube/captions.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed55f9a2a3083d8d75f8611967a3b49666c66eba
--- /dev/null
+++ b/pytube/captions.py
@@ -0,0 +1,154 @@
+import math
+import os
+import time
+import xml.etree.ElementTree as ElementTree
+from html import unescape
+from typing import Dict, Optional
+
+from pytube import request
+from pytube.helpers import safe_filename, target_directory
+
+
+class Caption:
+ """Container for caption tracks."""
+
+ def __init__(self, caption_track: Dict):
+ """Construct a :class:`Caption <Caption>`.
+
+ :param dict caption_track:
+ Caption track data extracted from ``watch_html``.
+ """
+ self.url = caption_track.get("baseUrl")
+
+ # Certain videos have runs instead of simpleText
+ # this handles that edge case
+ name_dict = caption_track['name']
+ if 'simpleText' in name_dict:
+ self.name = name_dict['simpleText']
+ else:
+ for el in name_dict['runs']:
+ if 'text' in el:
+ self.name = el['text']
+
+ # Use "vssId" instead of "languageCode", fix issue #779
+ self.code = caption_track["vssId"]
+ # Remove preceding '.' for backwards compatibility, e.g.:
+ # English -> vssId: .en, languageCode: en
+ # English (auto-generated) -> vssId: a.en, languageCode: en
+ self.code = self.code.strip('.')
+
+ @property
+ def xml_captions(self) -> str:
+ """Download the xml caption tracks."""
+ return request.get(self.url)
+
+ def generate_srt_captions(self) -> str:
+ """Generate "SubRip Subtitle" captions.
+
+ Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and
+ recompiles them into the "SubRip Subtitle" format.
+ """
+ return self.xml_caption_to_srt(self.xml_captions)
+
+ @staticmethod
+ def float_to_srt_time_format(d: float) -> str:
+ """Convert decimal durations into proper srt format.
+
+ :rtype: str
+ :returns:
+ SubRip Subtitle (str) formatted time duration.
+
+ float_to_srt_time_format(3.89) -> '00:00:03,890'
+ """
+ fraction, whole = math.modf(d)
+ time_fmt = time.strftime("%H:%M:%S,", time.gmtime(whole))
+ ms = f"{fraction:.3f}".replace("0.", "")
+ return time_fmt + ms
+
+ def xml_caption_to_srt(self, xml_captions: str) -> str:
+ """Convert xml caption tracks to "SubRip Subtitle (srt)".
+
+ :param str xml_captions:
+ XML formatted caption tracks.
+ """
+ segments = []
+ root = ElementTree.fromstring(xml_captions)
+ for i, child in enumerate(list(root)):
+ text = child.text or ""
+ caption = unescape(text.replace("\n", " ").replace("  ", " "),)
+ try:
+ duration = float(child.attrib["dur"])
+ except KeyError:
+ duration = 0.0
+ start = float(child.attrib["start"])
+ end = start + duration
+ sequence_number = i + 1 # convert from 0-indexed to 1.
+ line = "{seq}\n{start} --> {end}\n{text}\n".format(
+ seq=sequence_number,
+ start=self.float_to_srt_time_format(start),
+ end=self.float_to_srt_time_format(end),
+ text=caption,
+ )
+ segments.append(line)
+ return "\n".join(segments).strip()
+
+ def download(
+ self,
+ title: str,
+ srt: bool = True,
+ output_path: Optional[str] = None,
+ filename_prefix: Optional[str] = None,
+ ) -> str:
+ """Write the media stream to disk.
+
+ :param title:
+ Output filename (stem only) for writing media file.
+ If one is not specified, the default filename is used.
+ :type title: str
+ :param srt:
+ Set to True to download srt, false to download xml. Defaults to True.
+ :type srt: bool
+ :param output_path:
+ (optional) Output path for writing media file. If one is not
+ specified, defaults to the current working directory.
+ :type output_path: str or None
+ :param filename_prefix:
+ (optional) A string that will be prepended to the filename.
+ For example a number in a playlist or the name of a series.
+ If one is not specified, nothing will be prepended
+ This is separate from filename so you can use the default
+ filename but still add a prefix.
+ :type filename_prefix: str or None
+
+ :rtype: str
+ """
+ if title.endswith(".srt") or title.endswith(".xml"):
+ filename = ".".join(title.split(".")[:-1])
+ else:
+ filename = title
+
+ if filename_prefix:
+ filename = f"{safe_filename(filename_prefix)}{filename}"
+
+ filename = safe_filename(filename)
+
+ filename += f" ({self.code})"
+
+ if srt:
+ filename += ".srt"
+ else:
+ filename += ".xml"
+
+ file_path = os.path.join(target_directory(output_path), filename)
+
+ with open(file_path, "w", encoding="utf-8") as file_handle:
+ if srt:
+ file_handle.write(self.generate_srt_captions())
+ else:
+ file_handle.write(self.xml_captions)
+
+ return file_path
+
+ def __repr__(self):
+ """Printable object representation."""
+ return '<Caption lang="{s.name}" code="{s.code}">'.format(s=self)
diff --git a/pytube/cipher.py b/pytube/cipher.py
new file mode 100644
index 0000000000000000000000000000000000000000..d385d83a491b76ab44e0b464186cd374c7e14e69
--- /dev/null
+++ b/pytube/cipher.py
@@ -0,0 +1,697 @@
+"""
+This module contains all logic necessary to decipher the signature.
+
+YouTube's strategy to restrict downloading videos is to send a ciphered version
+of the signature to the client, along with the decryption algorithm obfuscated
+in JavaScript. For the clients to play the videos, JavaScript must take the
+ciphered version, cycle it through a series of "transform functions," and then
+signs the media URL with the output.
+
+This module is responsible for (1) finding and extracting those "transform
+functions" (2) maps them to Python equivalents and (3) taking the ciphered
+signature and decoding it.
+
+"""
+import logging
+import re
+from itertools import chain
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from pytube.exceptions import ExtractError, RegexMatchError
+from pytube.helpers import cache, regex_search
+from pytube.parser import find_object_from_startpoint, throttling_array_split
+
+logger = logging.getLogger(__name__)
+
+
+class Cipher:
+ def __init__(self, js: str):
+ self.transform_plan: List[str] = get_transform_plan(js)
+ var_regex = re.compile(r"^\w+\W")
+ var_match = var_regex.search(self.transform_plan[0])
+ if not var_match:
+ raise RegexMatchError(
+ caller="__init__", pattern=var_regex.pattern
+ )
+ var = var_match.group(0)[:-1]
+ self.transform_map = get_transform_map(js, var)
+ self.js_func_patterns = [
+ r"\w+\.(\w+)\(\w,(\d+)\)",
+ r"\w+\[(\"\w+\")\]\(\w,(\d+)\)"
+ ]
+
+ self.throttling_plan = get_throttling_plan(js)
+ self.throttling_array = get_throttling_function_array(js)
+
+ self.calculated_n = None
+
+ def calculate_n(self, initial_n: list):
+ """Converts n to the correct value to prevent throttling."""
+ if self.calculated_n:
+ return self.calculated_n
+
+ # First, update all instances of 'b' with the list(initial_n)
+ for i in range(len(self.throttling_array)):
+ if self.throttling_array[i] == 'b':
+ self.throttling_array[i] = initial_n
+
+ for step in self.throttling_plan:
+ curr_func = self.throttling_array[int(step[0])]
+ if not callable(curr_func):
+ logger.debug(f'{curr_func} is not callable.')
+ logger.debug(f'Throttling array:\n{self.throttling_array}\n')
+ raise ExtractError(f'{curr_func} is not callable.')
+
+ first_arg = self.throttling_array[int(step[1])]
+
+ if len(step) == 2:
+ curr_func(first_arg)
+ elif len(step) == 3:
+ second_arg = self.throttling_array[int(step[2])]
+ curr_func(first_arg, second_arg)
+
+ self.calculated_n = ''.join(initial_n)
+ return self.calculated_n
+
+ def get_signature(self, ciphered_signature: str) -> str:
+ """Decipher the signature.
+
+ Taking the ciphered signature, applies the transform functions.
+
+ :param str ciphered_signature:
+ The ciphered signature sent in the ``player_config``.
+ :rtype: str
+ :returns:
+ Decrypted signature required to download the media content.
+ """
+ signature = list(ciphered_signature)
+
+ for js_func in self.transform_plan:
+ name, argument = self.parse_function(js_func) # type: ignore
+ signature = self.transform_map[name](signature, argument)
+ logger.debug(
+ "applied transform function\n"
+ "output: %s\n"
+ "js_function: %s\n"
+ "argument: %d\n"
+ "function: %s",
+ "".join(signature),
+ name,
+ argument,
+ self.transform_map[name],
+ )
+
+ return "".join(signature)
+
+ @cache
+ def parse_function(self, js_func: str) -> Tuple[str, int]:
+ """Parse the Javascript transform function.
+
+ Break a JavaScript transform function down into a two element ``tuple``
+ containing the function name and some integer-based argument.
+
+ :param str js_func:
+ The JavaScript version of the transform function.
+ :rtype: tuple
+ :returns:
+ two element tuple containing the function name and an argument.
+
+ **Example**:
+
+ parse_function('DE.AJ(a,15)')
+ ('AJ', 15)
+
+ """
+ logger.debug("parsing transform function")
+ for pattern in self.js_func_patterns:
+ regex = re.compile(pattern)
+ parse_match = regex.search(js_func)
+ if parse_match:
+ fn_name, fn_arg = parse_match.groups()
+ return fn_name, int(fn_arg)
+
+ raise RegexMatchError(
+ caller="parse_function", pattern="js_func_patterns"
+ )
+
+
+def get_initial_function_name(js: str) -> str:
+ """Extract the name of the function responsible for computing the signature.
+ :param str js:
+ The contents of the base.js asset file.
+ :rtype: str
+ :returns:
+ Function name from regex match
+ """
+
+ function_patterns = [
+ r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
+ r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # noqa: E501
+ r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # noqa: E501
+ r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+ r"\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(",
+ r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
+ r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
+ r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
+ r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
+ r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
+ r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
+ ]
+ logger.debug("finding initial function name")
+ for pattern in function_patterns:
+ regex = re.compile(pattern)
+ function_match = regex.search(js)
+ if function_match:
+ logger.debug("finished regex search, matched: %s", pattern)
+ return function_match.group(1)
+
+ raise RegexMatchError(
+ caller="get_initial_function_name", pattern="multiple"
+ )
+
+
+def get_transform_plan(js: str) -> List[str]:
+ """Extract the "transform plan".
+
+ The "transform plan" is the functions that the ciphered signature is
+ cycled through to obtain the actual signature.
+
+ :param str js:
+ The contents of the base.js asset file.
+
+ **Example**:
+
+ ['DE.AJ(a,15)',
+ 'DE.VR(a,3)',
+ 'DE.AJ(a,51)',
+ 'DE.VR(a,3)',
+ 'DE.kT(a,51)',
+ 'DE.kT(a,8)',
+ 'DE.VR(a,3)',
+ 'DE.kT(a,21)']
+ """
+ name = re.escape(get_initial_function_name(js))
+ pattern = r"%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}" % name
+ logger.debug("getting transform plan")
+ return regex_search(pattern, js, group=1).split(";")
+
+
+def get_transform_object(js: str, var: str) -> List[str]:
+ """Extract the "transform object".
+
+ The "transform object" contains the function definitions referenced in the
+ "transform plan". The ``var`` argument is the obfuscated variable name
+ which contains these functions, for example, given the function call
+ ``DE.AJ(a,15)`` returned by the transform plan, "DE" would be the var.
+
+ :param str js:
+ The contents of the base.js asset file.
+ :param str var:
+ The obfuscated variable name that stores an object with all functions
+ that descrambles the signature.
+
+ **Example**:
+
+ >>> get_transform_object(js, 'DE')
+ ['AJ:function(a){a.reverse()}',
+ 'VR:function(a,b){a.splice(0,b)}',
+ 'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}']
+
+ """
+ pattern = r"var %s={(.*?)};" % re.escape(var)
+ logger.debug("getting transform object")
+ regex = re.compile(pattern, flags=re.DOTALL)
+ transform_match = regex.search(js)
+ if not transform_match:
+ raise RegexMatchError(caller="get_transform_object", pattern=pattern)
+
+ return transform_match.group(1).replace("\n", " ").split(", ")
+
+
+def get_transform_map(js: str, var: str) -> Dict:
+ """Build a transform function lookup.
+
+ Build a lookup table of obfuscated JavaScript function names to the
+ Python equivalents.
+
+ :param str js:
+ The contents of the base.js asset file.
+ :param str var:
+ The obfuscated variable name that stores an object with all functions
+ that descrambles the signature.
+
+ """
+ transform_object = get_transform_object(js, var)
+ mapper = {}
+ for obj in transform_object:
+ # AJ:function(a){a.reverse()} => AJ, function(a){a.reverse()}
+ name, function = obj.split(":", 1)
+ fn = map_functions(function)
+ mapper[name] = fn
+ return mapper
+
+
+def get_throttling_function_name(js: str) -> str:
+ """Extract the name of the function that computes the throttling parameter.
+
+ :param str js:
+ The contents of the base.js asset file.
+ :rtype: str
+ :returns:
+ The name of the function used to compute the throttling parameter.
+ """
+ function_patterns = [
+ # https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
+ # https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8
+ # var Bpa = [iha];
+ # ...
+ # a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b),
+ # Bpa.length || iha("")) }};
+ # In the above case, `iha` is the relevant function name
+ r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&\s*'
+ r'\([a-z]\s*=\s*([a-zA-Z0-9$]+)(\[\d+\])?\([a-z]\)',
+ ]
+ logger.debug('Finding throttling function name')
+ for pattern in function_patterns:
+ regex = re.compile(pattern)
+ function_match = regex.search(js)
+ if function_match:
+ logger.debug("finished regex search, matched: %s", pattern)
+ if len(function_match.groups()) == 1:
+ return function_match.group(1)
+ idx = function_match.group(2)
+ if idx:
+ idx = idx.strip("[]")
+ array = re.search(
+ r'var {nfunc}\s*=\s*(\[.+?\]);'.format(
+ nfunc=re.escape(function_match.group(1))),
+ js
+ )
+ if array:
+ array = array.group(1).strip("[]").split(",")
+ array = [x.strip() for x in array]
+ return array[int(idx)]
+
+ raise RegexMatchError(
+ caller="get_throttling_function_name", pattern="multiple"
+ )
+
+
+def get_throttling_function_code(js: str) -> str:
+ """Extract the raw code for the throttling function.
+
+ :param str js:
+ The contents of the base.js asset file.
+ :rtype: str
+ :returns:
+ The name of the function used to compute the throttling parameter.
+ """
+ # Begin by extracting the correct function name
+ name = re.escape(get_throttling_function_name(js))
+
+ # Identify where the function is defined
+ pattern_start = r"%s=function\(\w\)" % name
+ regex = re.compile(pattern_start)
+ match = regex.search(js)
+
+ # Extract the code within curly braces for the function itself, and merge any split lines
+ code_lines_list = find_object_from_startpoint(js, match.span()[1]).split('\n')
+ joined_lines = "".join(code_lines_list)
+
+ # Prepend function definition (e.g. `Dea=function(a)`)
+ return match.group(0) + joined_lines
+
+
+def get_throttling_function_array(js: str) -> List[Any]:
+ """Extract the "c" array.
+
+ :param str js:
+ The contents of the base.js asset file.
+ :returns:
+ The array of various integers, arrays, and functions.
+ """
+ raw_code = get_throttling_function_code(js)
+
+ array_start = r",c=\["
+ array_regex = re.compile(array_start)
+ match = array_regex.search(raw_code)
+
+ array_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1)
+ str_array = throttling_array_split(array_raw)
+
+ converted_array = []
+ for el in str_array:
+ try:
+ converted_array.append(int(el))
+ continue
+ except ValueError:
+ # Not an integer value.
+ pass
+
+ if el == 'null':
+ converted_array.append(None)
+ continue
+
+ if el.startswith('"') and el.endswith('"'):
+ # Convert e.g. '"abcdef"' to string without quotation marks, 'abcdef'
+ converted_array.append(el[1:-1])
+ continue
+
+ if el.startswith('function'):
+ mapper = (
+ (r"{for\(\w=\(\w%\w\.length\+\w\.length\)%\w\.length;\w--;\)\w\.unshift\(\w.pop\(\)\)}", throttling_unshift), # noqa:E501
+ (r"{\w\.reverse\(\)}", throttling_reverse),
+ (r"{\w\.push\(\w\)}", throttling_push),
+ (r";var\s\w=\w\[0\];\w\[0\]=\w\[\w\];\w\[\w\]=\w}", throttling_swap),
+ (r"case\s\d+", throttling_cipher_function),
+ (r"\w\.splice\(0,1,\w\.splice\(\w,1,\w\[0\]\)\[0\]\)", throttling_nested_splice), # noqa:E501
+ (r";\w\.splice\(\w,1\)}", js_splice),
+ (r"\w\.splice\(-\w\)\.reverse\(\)\.forEach\(function\(\w\){\w\.unshift\(\w\)}\)", throttling_prepend), # noqa:E501
+ (r"for\(var \w=\w\.length;\w;\)\w\.push\(\w\.splice\(--\w,1\)\[0\]\)}", throttling_reverse), # noqa:E501
+ )
+
+ found = False
+ for pattern, fn in mapper:
+ if re.search(pattern, el):
+ converted_array.append(fn)
+ found = True
+ if found:
+ continue
+
+ converted_array.append(el)
+
+ # Replace null elements with array itself
+ for i in range(len(converted_array)):
+ if converted_array[i] is None:
+ converted_array[i] = converted_array
+
+ return converted_array
+
+
+def get_throttling_plan(js: str):
+ """Extract the "throttling plan".
+
+ The "throttling plan" is a list of tuples used for calling functions
+ in the c array. The first element of the tuple is the index of the
+ function to call, and any remaining elements of the tuple are arguments
+ to pass to that function.
+
+ :param str js:
+ The contents of the base.js asset file.
+ :returns:
+ The full function code for computing the throttling parameter.
+ """
+ raw_code = get_throttling_function_code(js)
+
+ transform_start = r"try{"
+ plan_regex = re.compile(transform_start)
+ match = plan_regex.search(raw_code)
+
+ transform_plan_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1)
+
+ # Steps are either c[x](c[y]) or c[x](c[y],c[z])
+ step_start = r"c\[(\d+)\]\(c\[(\d+)\](,c(\[(\d+)\]))?\)"
+ step_regex = re.compile(step_start)
+ matches = step_regex.findall(transform_plan_raw)
+ transform_steps = []
+ for match in matches:
+ if match[4] != '':
+ transform_steps.append((match[0],match[1],match[4]))
+ else:
+ transform_steps.append((match[0],match[1]))
+
+ return transform_steps
+
+
+def reverse(arr: List, _: Optional[Any]):
+ """Reverse elements in a list.
+
+ This function is equivalent to:
+
+ .. code-block:: javascript
+
+ function(a, b) { a.reverse() }
+
+ This method takes an unused ``b`` variable as their transform functions
+ universally sent two arguments.
+
+ **Example**:
+
+ >>> reverse([1, 2, 3, 4])
+ [4, 3, 2, 1]
+ """
+ return arr[::-1]
+
+
+def splice(arr: List, b: int):
+ """Add/remove items to/from a list.
+
+ This function is equivalent to:
+
+ .. code-block:: javascript
+
+ function(a, b) { a.splice(0, b) }
+
+ **Example**:
+
+ >>> splice([1, 2, 3, 4], 2)
+ [1, 2]
+ """
+ return arr[b:]
+
+
+def swap(arr: List, b: int):
+ """Swap positions at b modulus the list length.
+
+ This function is equivalent to:
+
+ .. code-block:: javascript
+
+ function(a, b) { var c=a[0];a[0]=a[b%a.length];a[b]=c }
+
+ **Example**:
+
+ >>> swap([1, 2, 3, 4], 2)
+ [3, 2, 1, 4]
+ """
+ r = b % len(arr)
+ return list(chain([arr[r]], arr[1:r], [arr[0]], arr[r + 1 :]))
+
+
+def throttling_reverse(arr: list):
+ """Reverses the input list.
+
+ Needs to do an in-place reversal so that the passed list gets changed.
+ To accomplish this, we create a reversed copy, and then change each
+ individual element.
+ """
+ reverse_copy = arr.copy()[::-1]
+ for i in range(len(reverse_copy)):
+ arr[i] = reverse_copy[i]
+
+
+def throttling_push(d: list, e: Any):
+ """Pushes an element onto a list."""
+ d.append(e)
+
+
+def throttling_mod_func(d: list, e: int):
+ """Perform the modular function from the throttling array functions.
+
+ In the javascript, the modular operation is as follows:
+ e = (e % d.length + d.length) % d.length
+
+ We simply translate this to python here.
+ """
+ return (e % len(d) + len(d)) % len(d)
+
+
+def throttling_unshift(d: list, e: int):
+ """Rotates the elements of the list to the right.
+
+ In the javascript, the operation is as follows:
+ for(e=(e%d.length+d.length)%d.length;e--;)d.unshift(d.pop())
+ """
+ e = throttling_mod_func(d, e)
+ new_arr = d[-e:] + d[:-e]
+ d.clear()
+ for el in new_arr:
+ d.append(el)
+
+
+def throttling_cipher_function(d: list, e: str):
+ """This ciphers d with e to generate a new list.
+
+ In the javascript, the operation is as follows:
+ var h = [A-Za-z0-9-_], f = 96; // simplified from switch-case loop
+ d.forEach(
+ function(l,m,n){
+ this.push(
+ n[m]=h[
+ (h.indexOf(l)-h.indexOf(this[m])+m-32+f--)%h.length
+ ]
+ )
+ },
+ e.split("")
+ )
+ """
+ h = list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_')
+ f = 96
+ # by naming it "this" we can more closely reflect the js
+ this = list(e)
+
+ # This is so we don't run into weirdness with enumerate while
+ # we change the input list
+ copied_list = d.copy()
+
+ for m, l in enumerate(copied_list):
+ bracket_val = (h.index(l) - h.index(this[m]) + m - 32 + f) % len(h)
+ this.append(
+ h[bracket_val]
+ )
+ d[m] = h[bracket_val]
+ f -= 1
+
+
+def throttling_nested_splice(d: list, e: int):
+ """Nested splice function in throttling js.
+
+ In the javascript, the operation is as follows:
+ function(d,e){
+ e=(e%d.length+d.length)%d.length;
+ d.splice(
+ 0,
+ 1,
+ d.splice(
+ e,
+ 1,
+ d[0]
+ )[0]
+ )
+ }
+
+ While testing, all this seemed to do is swap element 0 and e,
+ but the actual process is preserved in case there was an edge
+ case that was not considered.
+ """
+ e = throttling_mod_func(d, e)
+ inner_splice = js_splice(
+ d,
+ e,
+ 1,
+ d[0]
+ )
+ js_splice(
+ d,
+ 0,
+ 1,
+ inner_splice[0]
+ )
+
+
+def throttling_prepend(d: list, e: int):
+ """
+
+ In the javascript, the operation is as follows:
+ function(d,e){
+ e=(e%d.length+d.length)%d.length;
+ d.splice(-e).reverse().forEach(
+ function(f){
+ d.unshift(f)
+ }
+ )
+ }
+
+ Effectively, this moves the last e elements of d to the beginning.
+ """
+ start_len = len(d)
+ # First, calculate e
+ e = throttling_mod_func(d, e)
+
+ # Then do the prepending
+ new_arr = d[-e:] + d[:-e]
+
+ # And update the input list
+ d.clear()
+ for el in new_arr:
+ d.append(el)
+
+ end_len = len(d)
+ assert start_len == end_len
+
+
+def throttling_swap(d: list, e: int):
+ """Swap positions of the 0'th and e'th elements in-place."""
+ e = throttling_mod_func(d, e)
+ f = d[0]
+ d[0] = d[e]
+ d[e] = f
+
+
+def js_splice(arr: list, start: int, delete_count=None, *items):
+ """Implementation of javascript's splice function.
+
+ :param list arr:
+ Array to splice
+ :param int start:
+ Index at which to start changing the array
+ :param int delete_count:
+ Number of elements to delete from the array
+ :param *items:
+ Items to add to the array
+
+ Reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice # noqa:E501
+ """
+ # Special conditions for start value
+ try:
+ if start > len(arr):
+ start = len(arr)
+ # If start is negative, count backwards from end
+ if start < 0:
+ start = len(arr) - start
+ except TypeError:
+ # Non-integer start values are treated as 0 in js
+ start = 0
+
+ # Special condition when delete_count is greater than remaining elements
+ if not delete_count or delete_count >= len(arr) - start:
+ delete_count = len(arr) - start # noqa: N806
+
+ deleted_elements = arr[start:start + delete_count]
+
+ # Splice appropriately.
+ new_arr = arr[:start] + list(items) + arr[start + delete_count:]
+
+ # Replace contents of input array
+ arr.clear()
+ for el in new_arr:
+ arr.append(el)
+
+ return deleted_elements
+
+
+def map_functions(js_func: str) -> Callable:
+ """For a given JavaScript transform function, return the Python equivalent.
+
+ :param str js_func:
+ The JavaScript version of the transform function.
+ """
+ mapper = (
+ # function(a){a.reverse()}
+ (r"{\w\.reverse\(\)}", reverse),
+ # function(a,b){a.splice(0,b)}
+ (r"{\w\.splice\(0,\w\)}", splice),
+ # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}
+ (r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\]=\w}", swap),
+ # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c}
+ (
+ r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\%\w.length\]=\w}",
+ swap,
+ ),
+ )
+
+ for pattern, fn in mapper:
+ if re.search(pattern, js_func):
+ return fn
+ raise RegexMatchError(caller="map_functions", pattern="multiple")
diff --git a/pytube/cli.py b/pytube/cli.py
new file mode 100755
index 0000000000000000000000000000000000000000..7a9885478a6403d6029430d43fa421c752b1e00f
--- /dev/null
+++ b/pytube/cli.py
@@ -0,0 +1,560 @@
+#!/usr/bin/env python3
+"""A simple command line application to download youtube videos."""
+import argparse
+import gzip
+import json
+import logging
+import os
+import shutil
+import sys
+import datetime as dt
+import subprocess # nosec
+from typing import List, Optional
+
+import pytube.exceptions as exceptions
+from pytube import __version__
+from pytube import CaptionQuery, Playlist, Stream, YouTube
+from pytube.helpers import safe_filename, setup_logger
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+ """Command line application to download youtube videos."""
+ # noinspection PyTypeChecker
+ parser = argparse.ArgumentParser(description=main.__doc__)
+ args = _parse_args(parser)
+ if args.verbose:
+ log_filename = None
+ if args.logfile:
+ log_filename = args.logfile
+ setup_logger(logging.DEBUG, log_filename=log_filename)
+ logger.debug(f'Pytube version: {__version__}')
+
+ if not args.url or "youtu" not in args.url:
+ parser.print_help()
+ sys.exit(1)
+
+ if "/playlist" in args.url:
+ print("Loading playlist...")
+ playlist = Playlist(args.url)
+ if not args.target:
+ args.target = safe_filename(playlist.title)
+ for youtube_video in playlist.videos:
+ try:
+ _perform_args_on_youtube(youtube_video, args)
+ except exceptions.PytubeError as e:
+ print(f"There was an error with video: {youtube_video}")
+ print(e)
+ else:
+ print("Loading video...")
+ youtube = YouTube(args.url)
+ _perform_args_on_youtube(youtube, args)
+
+
+def _perform_args_on_youtube(
+ youtube: YouTube, args: argparse.Namespace
+) -> None:
+ if len(sys.argv) == 2 : # no arguments parsed
+ download_highest_resolution_progressive(
+ youtube=youtube, resolution="highest", target=args.target
+ )
+ if args.list_captions:
+ _print_available_captions(youtube.captions)
+ if args.list:
+ display_streams(youtube)
+ if args.build_playback_report:
+ build_playback_report(youtube)
+ if args.itag:
+ download_by_itag(youtube=youtube, itag=args.itag, target=args.target)
+ if args.caption_code:
+ download_caption(
+ youtube=youtube, lang_code=args.caption_code, target=args.target
+ )
+ if args.resolution:
+ download_by_resolution(
+ youtube=youtube, resolution=args.resolution, target=args.target
+ )
+ if args.audio:
+ download_audio(
+ youtube=youtube, filetype=args.audio, target=args.target
+ )
+ if args.ffmpeg:
+ ffmpeg_process(
+ youtube=youtube, resolution=args.ffmpeg, target=args.target
+ )
+
+
+def _parse_args(
+ parser: argparse.ArgumentParser, args: Optional[List] = None
+) -> argparse.Namespace:
+ parser.add_argument(
+ "url", help="The YouTube /watch or /playlist url", nargs="?"
+ )
+ parser.add_argument(
+ "--version", action="version", version="%(prog)s " + __version__,
+ )
+ parser.add_argument(
+ "--itag", type=int, help="The itag for the desired stream",
+ )
+ parser.add_argument(
+ "-r",
+ "--resolution",
+ type=str,
+ help="The resolution for the desired stream",
+ )
+ parser.add_argument(
+ "-l",
+ "--list",
+ action="store_true",
+ help=(
+ "The list option causes pytube cli to return a list of streams "
+ "available to download"
+ ),
+ )
+ parser.add_argument(
+ "-v",
+ "--verbose",
+ action="store_true",
+ dest="verbose",
+ help="Set logger output to verbose output.",
+ )
+ parser.add_argument(
+ "--logfile",
+ action="store",
+ help="logging debug and error messages into a log file",
+ )
+ parser.add_argument(
+ "--build-playback-report",
+ action="store_true",
+ help="Save the html and js to disk",
+ )
+ parser.add_argument(
+ "-c",
+ "--caption-code",
+ type=str,
+ help=(
+ "Download srt captions for given language code. "
+ "Prints available language codes if no argument given"
+ ),
+ )
+ parser.add_argument(
+ '-lc',
+ '--list-captions',
+ action='store_true',
+ help=(
+ "List available caption codes for a video"
+ )
+ )
+ parser.add_argument(
+ "-t",
+ "--target",
+ help=(
+ "The output directory for the downloaded stream. "
+ "Default is current working directory"
+ ),
+ )
+ parser.add_argument(
+ "-a",
+ "--audio",
+ const="mp4",
+ nargs="?",
+ help=(
+ "Download the audio for a given URL at the highest bitrate available"
+ "Defaults to mp4 format if none is specified"
+ ),
+ )
+ parser.add_argument(
+ "-f",
+ "--ffmpeg",
+ const="best",
+ nargs="?",
+ help=(
+ "Downloads the audio and video stream for resolution provided"
+ "If no resolution is provided, downloads the best resolution"
+ "Runs the command line program ffmpeg to combine the audio and video"
+ ),
+ )
+
+ return parser.parse_args(args)
+
+
+def build_playback_report(youtube: YouTube) -> None:
+ """Serialize the request data to json for offline debugging.
+
+ :param YouTube youtube:
+ A YouTube object.
+ """
+ ts = int(dt.datetime.utcnow().timestamp())
+ fp = os.path.join(os.getcwd(), f"yt-video-{youtube.video_id}-{ts}.json.gz")
+
+ js = youtube.js
+ watch_html = youtube.watch_html
+ vid_info = youtube.vid_info
+
+ with gzip.open(fp, "wb") as fh:
+ fh.write(
+ json.dumps(
+ {
+ "url": youtube.watch_url,
+ "js": js,
+ "watch_html": watch_html,
+ "video_info": vid_info,
+ }
+ ).encode("utf8"),
+ )
+
+
+def display_progress_bar(
+ bytes_received: int, filesize: int, ch: str = "█", scale: float = 0.55
+) -> None:
+ """Display a simple, pretty progress bar.
+
+ Example:
+ ~~~~~~~~
+ PSY - GANGNAM STYLE(강남스타일) MV.mp4
+ ↳ |███████████████████████████████████████| 100.0%
+
+ :param int bytes_received:
+ The delta between the total file size (bytes) and bytes already
+ written to disk.
+ :param int filesize:
+ File size of the media stream in bytes.
+ :param str ch:
+ Character to use for presenting progress segment.
+ :param float scale:
+ Scale multiplier to reduce progress bar size.
+
+ """
+ columns = shutil.get_terminal_size().columns
+ max_width = int(columns * scale)
+
+ filled = int(round(max_width * bytes_received / float(filesize)))
+ remaining = max_width - filled
+ progress_bar = ch * filled + " " * remaining
+ percent = round(100.0 * bytes_received / float(filesize), 1)
+ text = f" ↳ |{progress_bar}| {percent}%\r"
+ sys.stdout.write(text)
+ sys.stdout.flush()
+
+
+# noinspection PyUnusedLocal
+def on_progress(
+ stream: Stream, chunk: bytes, bytes_remaining: int
+) -> None: # pylint: disable=W0613
+ filesize = stream.filesize
+ bytes_received = filesize - bytes_remaining
+ display_progress_bar(bytes_received, filesize)
+
+
+def _download(
+ stream: Stream,
+ target: Optional[str] = None,
+ filename: Optional[str] = None,
+) -> None:
+ filesize_megabytes = stream.filesize // 1048576
+ print(f"{filename or stream.default_filename} | {filesize_megabytes} MB")
+ file_path = stream.get_file_path(filename=filename, output_path=target)
+ if stream.exists_at_path(file_path):
+ print(f"Already downloaded at:\n{file_path}")
+ return
+
+ stream.download(output_path=target, filename=filename)
+ sys.stdout.write("\n")
+
+
+def _unique_name(base: str, subtype: str, media_type: str, target: str) -> str:
+ """
+ Given a base name, the file format, and the target directory, will generate
+ a filename unique for that directory and file format.
+ :param str base:
+ The given base-name.
+ :param str subtype:
+ The filetype of the video which will be downloaded.
+ :param str media_type:
+ The media_type of the file, ie. "audio" or "video"
+ :param Path target:
+ Target directory for download.
+ """
+ counter = 0
+ while True:
+ file_name = f"{base}_{media_type}_{counter}"
+ file_path = os.path.join(target, f"{file_name}.{subtype}")
+ if not os.path.exists(file_path):
+ return file_name
+ counter += 1
+
+
def ffmpeg_process(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """
    Decides the correct video stream to download, then calls _ffmpeg_downloader.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        YouTube video resolution (e.g. "1080p"), or "best" to select the
        highest resolution available.
    :param str target:
        Target directory for download; defaults to the current working
        directory.
    """
    youtube.register_on_progress_callback(on_progress)
    target = target or os.getcwd()

    if resolution == "best":
        # Best adaptive (non-progressive) stream of any container type.
        highest_quality_stream = (
            youtube.streams.filter(progressive=False)
            .order_by("resolution")
            .last()
        )
        # Best adaptive mp4 stream specifically.
        mp4_stream = (
            youtube.streams.filter(progressive=False, subtype="mp4")
            .order_by("resolution")
            .last()
        )
        # Prefer mp4 when it ties the overall best resolution
        # (presumably for container compatibility — not stated here).
        if highest_quality_stream.resolution == mp4_stream.resolution:
            video_stream = mp4_stream
        else:
            video_stream = highest_quality_stream
    else:
        # Exact resolution requested: try mp4 first, then any subtype.
        video_stream = youtube.streams.filter(
            progressive=False, resolution=resolution, subtype="mp4"
        ).first()
        if not video_stream:
            video_stream = youtube.streams.filter(
                progressive=False, resolution=resolution
            ).first()
    if video_stream is None:
        print(f"Could not find a stream with resolution: {resolution}")
        print("Try one of these:")
        display_streams(youtube)
        sys.exit()

    # The adaptive video stream is fetched without audio here, so select an
    # audio-only stream too: same subtype preferred, else best bitrate.
    audio_stream = youtube.streams.get_audio_only(video_stream.subtype)
    if not audio_stream:
        audio_stream = (
            youtube.streams.filter(only_audio=True).order_by("abr").last()
        )
    if not audio_stream:
        print("Could not find an audio only stream")
        sys.exit()
    _ffmpeg_downloader(
        audio_stream=audio_stream, video_stream=video_stream, target=target
    )
+
+
def _ffmpeg_downloader(
    audio_stream: Stream, video_stream: Stream, target: str
) -> None:
    """
    Download the given audio and video streams under unique temporary names,
    then use ffmpeg to mux them into a single file named after the video
    title, and finally delete the two intermediate downloads.

    :param Stream audio_stream:
        A valid Stream object representing the audio to download
    :param Stream video_stream:
        A valid Stream object representing the video to download
    :param str target:
        Target directory for the downloads and the combined output
    """
    # Unique names prevent clobbering earlier downloads of the same title.
    video_unique_name = _unique_name(
        safe_filename(video_stream.title),
        video_stream.subtype,
        "video",
        target=target,
    )
    audio_unique_name = _unique_name(
        safe_filename(video_stream.title),
        audio_stream.subtype,
        "audio",
        target=target,
    )
    _download(stream=video_stream, target=target, filename=video_unique_name)
    print("Loading audio...")
    _download(stream=audio_stream, target=target, filename=audio_unique_name)

    # Reconstruct the on-disk paths used by the two downloads above.
    video_path = os.path.join(
        target, f"{video_unique_name}.{video_stream.subtype}"
    )
    audio_path = os.path.join(
        target, f"{audio_unique_name}.{audio_stream.subtype}"
    )
    final_path = os.path.join(
        target, f"{safe_filename(video_stream.title)}.{video_stream.subtype}"
    )

    # "-codec copy" muxes the two inputs without re-encoding.
    subprocess.run(  # nosec
        [
            "ffmpeg",
            "-i",
            video_path,
            "-i",
            audio_path,
            "-codec",
            "copy",
            final_path,
        ]
    )
    # Remove the intermediate single-track files, keeping only the mux.
    os.unlink(video_path)
    os.unlink(audio_path)
+
+
def download_by_itag(
    youtube: YouTube, itag: int, target: Optional[str] = None
) -> None:
    """Download the stream identified by a specific itag.

    :param YouTube youtube:
        A valid YouTube object.
    :param int itag:
        YouTube format identifier code.
    :param str target:
        Target directory for download
    """
    selected = youtube.streams.get_by_itag(itag)
    if selected is None:
        print(f"Could not find a stream with itag: {itag}")
        print("Try one of these:")
        display_streams(youtube)
        sys.exit()

    youtube.register_on_progress_callback(on_progress)

    try:
        _download(selected, target=target)
    except KeyboardInterrupt:
        sys.exit()
+
+
def download_by_resolution(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """Download the stream matching the requested resolution.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        YouTube video resolution.
    :param str target:
        Target directory for download
    """
    # TODO(nficano): allow dash itags to be selected
    chosen = youtube.streams.get_by_resolution(resolution)
    if chosen is None:
        print(f"Could not find a stream with resolution: {resolution}")
        print("Try one of these:")
        display_streams(youtube)
        sys.exit()

    youtube.register_on_progress_callback(on_progress)

    try:
        _download(chosen, target=target)
    except KeyboardInterrupt:
        sys.exit()
+
+
def download_highest_resolution_progressive(
    youtube: YouTube, resolution: str, target: Optional[str] = None
) -> None:
    """Download the highest resolution progressive stream.

    :param YouTube youtube:
        A valid YouTube object.
    :param str resolution:
        YouTube video resolution (not used by this function; kept for
        interface compatibility).
    :param str target:
        Target directory for download
    """
    youtube.register_on_progress_callback(on_progress)
    try:
        best = youtube.streams.get_highest_resolution()
    except exceptions.VideoUnavailable as err:
        print(f"No video streams available: {err}")
    else:
        try:
            _download(best, target=target)
        except KeyboardInterrupt:
            sys.exit()
+
+
def display_streams(youtube: YouTube) -> None:
    """Print every available stream/format for the video, one per line.

    :param YouTube youtube:
        A valid YouTube object.
    """
    for available in youtube.streams:
        print(available)
+
+
+def _print_available_captions(captions: CaptionQuery) -> None:
+ print(
+ f"Available caption codes are: {', '.join(c.code for c in captions)}"
+ )
+
+
def download_caption(
    youtube: YouTube, lang_code: Optional[str], target: Optional[str] = None
) -> None:
    """Download a caption track for the YouTube video.

    :param YouTube youtube:
        A valid YouTube object.
    :param str lang_code:
        Language code desired for caption file.
        Prints available codes if the value is None
        or the desired code is not available.
    :param str target:
        Target directory for download
    """
    try:
        selected = youtube.captions[lang_code]
        file_path = selected.download(
            title=youtube.title, output_path=target
        )
        print(f"Saved caption file to: {file_path}")
    except KeyError:
        print(f"Unable to find caption with code: {lang_code}")
        _print_available_captions(youtube.captions)
+
+
def download_audio(
    youtube: YouTube, filetype: str, target: Optional[str] = None
) -> None:
    """
    Download the highest-bitrate audio-only stream in the given format.

    :param YouTube youtube:
        A valid YouTube object.
    :param str filetype:
        Desired file format to download.
    :param str target:
        Target directory for download
    """
    candidates = youtube.streams.filter(only_audio=True, subtype=filetype)
    audio = candidates.order_by("abr").last()

    if audio is None:
        print("No audio only stream found. Try one of these:")
        display_streams(youtube)
        sys.exit()

    youtube.register_on_progress_callback(on_progress)

    try:
        _download(audio, target=target)
    except KeyboardInterrupt:
        sys.exit()
+
+
# Script entry point: delegate to main(), defined earlier in this module.
if __name__ == "__main__":
    main()
diff --git a/pytube/contrib/__init__.py b/pytube/contrib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pytube/contrib/__pycache__/__init__.cpython-310.pyc b/pytube/contrib/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..147e1b832be9f01eff46776e11b577d91599a3b3
Binary files /dev/null and b/pytube/contrib/__pycache__/__init__.cpython-310.pyc differ
diff --git a/pytube/contrib/__pycache__/__init__.cpython-39.pyc b/pytube/contrib/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..febbf2fddad0991e56249082e78517ca95e40b9e
Binary files /dev/null and b/pytube/contrib/__pycache__/__init__.cpython-39.pyc differ
diff --git a/pytube/contrib/__pycache__/channel.cpython-310.pyc b/pytube/contrib/__pycache__/channel.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f84ea4af1f983133d0d1871e13885205edae5faf
Binary files /dev/null and b/pytube/contrib/__pycache__/channel.cpython-310.pyc differ
diff --git a/pytube/contrib/__pycache__/channel.cpython-39.pyc b/pytube/contrib/__pycache__/channel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8dc5ea77242bc1610966c04ccff54311c01b2852
Binary files /dev/null and b/pytube/contrib/__pycache__/channel.cpython-39.pyc differ
diff --git a/pytube/contrib/__pycache__/playlist.cpython-310.pyc b/pytube/contrib/__pycache__/playlist.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..234b436b8b52ee2e67b5fbd099889dfbbe0681d9
Binary files /dev/null and b/pytube/contrib/__pycache__/playlist.cpython-310.pyc differ
diff --git a/pytube/contrib/__pycache__/playlist.cpython-39.pyc b/pytube/contrib/__pycache__/playlist.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d907c7b146a13659690c487bccee9bca1fd1861
Binary files /dev/null and b/pytube/contrib/__pycache__/playlist.cpython-39.pyc differ
diff --git a/pytube/contrib/__pycache__/search.cpython-310.pyc b/pytube/contrib/__pycache__/search.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36021f1e2502eebdb4370d7c9cd6213ee04558d8
Binary files /dev/null and b/pytube/contrib/__pycache__/search.cpython-310.pyc differ
diff --git a/pytube/contrib/__pycache__/search.cpython-39.pyc b/pytube/contrib/__pycache__/search.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc70f8efd2c342ca8587bc4ec55eda85fc32cadd
Binary files /dev/null and b/pytube/contrib/__pycache__/search.cpython-39.pyc differ
diff --git a/pytube/contrib/channel.py b/pytube/contrib/channel.py
new file mode 100644
index 0000000000000000000000000000000000000000..147ff7eaa3c8d013a61ba02817c8400feb311c49
--- /dev/null
+++ b/pytube/contrib/channel.py
@@ -0,0 +1,201 @@
+# -*- coding: utf-8 -*-
+"""Module for interacting with a user's youtube channel."""
+import json
+import logging
+from typing import Dict, List, Optional, Tuple
+
+from pytube import extract, Playlist, request
+from pytube.helpers import uniqueify
+
+logger = logging.getLogger(__name__)
+
+
class Channel(Playlist):
    """A YouTube channel.

    Builds the per-tab page URLs for a channel and reuses the
    :class:`Playlist` machinery for fetching and paging through videos.
    """

    def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None):
        """Construct a :class:`Channel <Channel>`.

        :param str url:
            A valid YouTube channel URL.
        :param proxies:
            (Optional) A dictionary of proxies to use for web requests.
        """
        super().__init__(url, proxies)

        # Channel path component (presumably begins with "/", since it is
        # joined directly onto the domain below — confirm in extract).
        self.channel_uri = extract.channel_name(url)

        self.channel_url = (
            f"https://www.youtube.com{self.channel_uri}"
        )

        # Per-tab page URLs derived from the canonical channel URL.
        self.videos_url = self.channel_url + '/videos'
        self.playlists_url = self.channel_url + '/playlists'
        self.community_url = self.channel_url + '/community'
        self.featured_channels_url = self.channel_url + '/channels'
        self.about_url = self.channel_url + '/about'

        # Possible future additions
        # Lazy caches for the per-tab HTML, fetched on first property access.
        self._playlists_html = None
        self._community_html = None
        self._featured_channels_html = None
        self._about_html = None

    @property
    def channel_name(self):
        """Get the name of the YouTube channel.

        :rtype: str
        """
        return self.initial_data['metadata']['channelMetadataRenderer']['title']

    @property
    def channel_id(self):
        """Get the ID of the YouTube channel.

        This will return the underlying ID, not the vanity URL.

        :rtype: str
        """
        return self.initial_data['metadata']['channelMetadataRenderer']['externalId']

    @property
    def vanity_url(self):
        """Get the vanity URL of the YouTube channel.

        Returns None if it doesn't exist.

        :rtype: str
        """
        return self.initial_data['metadata']['channelMetadataRenderer'].get('vanityChannelUrl', None)  # noqa:E501

    @property
    def html(self):
        """Get the html for the /videos page.

        Overrides the Playlist behavior: the video listing lives on the
        channel's /videos tab rather than a playlist page.

        :rtype: str
        """
        if self._html:
            return self._html
        self._html = request.get(self.videos_url)
        return self._html

    @property
    def playlists_html(self):
        """Get the html for the /playlists page.

        Currently unused for any functionality.

        :rtype: str
        """
        if self._playlists_html:
            return self._playlists_html
        else:
            self._playlists_html = request.get(self.playlists_url)
            return self._playlists_html

    @property
    def community_html(self):
        """Get the html for the /community page.

        Currently unused for any functionality.

        :rtype: str
        """
        if self._community_html:
            return self._community_html
        else:
            self._community_html = request.get(self.community_url)
            return self._community_html

    @property
    def featured_channels_html(self):
        """Get the html for the /channels page.

        Currently unused for any functionality.

        :rtype: str
        """
        if self._featured_channels_html:
            return self._featured_channels_html
        else:
            self._featured_channels_html = request.get(self.featured_channels_url)
            return self._featured_channels_html

    @property
    def about_html(self):
        """Get the html for the /about page.

        Currently unused for any functionality.

        :rtype: str
        """
        if self._about_html:
            return self._about_html
        else:
            self._about_html = request.get(self.about_url)
            return self._about_html

    @staticmethod
    def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
        """Extracts videos from a raw json page

        :param str raw_json: Input json extracted from the page or the last
            server response
        :rtype: Tuple[List[str], Optional[str]]
        :returns: Tuple containing a list of up to 100 video watch ids and
            a continuation token, if more videos are available
        """
        initial_data = json.loads(raw_json)
        # this is the json tree structure, if the json was extracted from
        # html
        try:
            # tabs[1] — presumably the channel's "Videos" tab; confirm
            # against the current page layout if extraction breaks.
            videos = initial_data["contents"][
                "twoColumnBrowseResultsRenderer"][
                "tabs"][1]["tabRenderer"]["content"][
                "sectionListRenderer"]["contents"][0][
                "itemSectionRenderer"]["contents"][0][
                "gridRenderer"]["items"]
        except (KeyError, IndexError, TypeError):
            try:
                # this is the json tree structure, if the json was directly sent
                # by the server in a continuation response
                important_content = initial_data[1]['response']['onResponseReceivedActions'][
                    0
                ]['appendContinuationItemsAction']['continuationItems']
                videos = important_content
            except (KeyError, IndexError, TypeError):
                try:
                    # this is the json tree structure, if the json was directly sent
                    # by the server in a continuation response
                    # no longer a list and no longer has the "response" key
                    important_content = initial_data['onResponseReceivedActions'][0][
                        'appendContinuationItemsAction']['continuationItems']
                    videos = important_content
                except (KeyError, IndexError, TypeError) as p:
                    # None of the known layouts matched; log and give up.
                    logger.info(p)
                    return [], None

        try:
            # The trailing item may be a continuation marker rather than a
            # video; pop it off and keep its token for the next request.
            continuation = videos[-1]['continuationItemRenderer'][
                'continuationEndpoint'
            ]['continuationCommand']['token']
            videos = videos[:-1]
        except (KeyError, IndexError):
            # if there is an error, no continuation is available
            continuation = None

        # remove duplicates
        return (
            uniqueify(
                list(
                    # only extract the video ids from the video data
                    map(
                        lambda x: (
                            f"/watch?v="
                            f"{x['gridVideoRenderer']['videoId']}"
                        ),
                        videos
                    )
                ),
            ),
            continuation,
        )
diff --git a/pytube/contrib/playlist.py b/pytube/contrib/playlist.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55f5e9dc5ec2f7023be75638c951780c46d8d19
--- /dev/null
+++ b/pytube/contrib/playlist.py
@@ -0,0 +1,419 @@
+"""Module to download a complete playlist from a youtube channel."""
+import json
+import logging
+from collections.abc import Sequence
+from datetime import date, datetime
+from typing import Dict, Iterable, List, Optional, Tuple, Union
+
+from pytube import extract, request, YouTube
+from pytube.helpers import cache, DeferredGeneratorList, install_proxy, uniqueify
+
+logger = logging.getLogger(__name__)
+
+
class Playlist(Sequence):
    """Load a YouTube playlist with URL"""

    def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None):
        """Construct a :class:`Playlist <Playlist>`.

        :param str url:
            A valid YouTube playlist URL.
        :param proxies:
            (Optional) A dictionary of proxies to use for web requests.
        """
        if proxies:
            install_proxy(proxies)

        self._input_url = url

        # These need to be initialized as None for the properties.
        self._html = None
        self._ytcfg = None
        self._initial_data = None
        self._sidebar_info = None

        self._playlist_id = None

    @property
    def playlist_id(self):
        """Get the playlist id.

        :rtype: str
        """
        if self._playlist_id:
            return self._playlist_id
        self._playlist_id = extract.playlist_id(self._input_url)
        return self._playlist_id

    @property
    def playlist_url(self):
        """Get the base playlist url.

        :rtype: str
        """
        return f"https://www.youtube.com/playlist?list={self.playlist_id}"

    @property
    def html(self):
        """Get the playlist page html.

        Fetched once, then served from the instance cache.

        :rtype: str
        """
        if self._html:
            return self._html
        self._html = request.get(self.playlist_url)
        return self._html

    @property
    def ytcfg(self):
        """Extract the ytcfg from the playlist page html.

        :rtype: dict
        """
        if self._ytcfg:
            return self._ytcfg
        self._ytcfg = extract.get_ytcfg(self.html)
        return self._ytcfg

    @property
    def initial_data(self):
        """Extract the initial data from the playlist page html.

        :rtype: dict
        """
        if self._initial_data:
            return self._initial_data
        else:
            self._initial_data = extract.initial_data(self.html)
            return self._initial_data

    @property
    def sidebar_info(self):
        """Extract the sidebar info from the playlist page html.

        :rtype: dict
        """
        if self._sidebar_info:
            return self._sidebar_info
        else:
            self._sidebar_info = self.initial_data['sidebar'][
                'playlistSidebarRenderer']['items']
            return self._sidebar_info

    @property
    def yt_api_key(self):
        """Extract the INNERTUBE_API_KEY from the playlist ytcfg.

        :rtype: str
        """
        return self.ytcfg['INNERTUBE_API_KEY']

    def _paginate(
        self, until_watch_id: Optional[str] = None
    ) -> Iterable[List[str]]:
        """Parse the video links from the page source, yields the /watch?v=
        part from video link

        :param until_watch_id Optional[str]: YouTube Video watch id until
            which the playlist should be read.

        :rtype: Iterable[List[str]]
        :returns: Iterable of lists of YouTube watch ids
        """
        videos_urls, continuation = self._extract_videos(
            json.dumps(extract.initial_data(self.html))
        )
        if until_watch_id:
            try:
                # Stop early: yield only the videos before the target id.
                trim_index = videos_urls.index(f"/watch?v={until_watch_id}")
                yield videos_urls[:trim_index]
                return
            except ValueError:
                pass
        yield videos_urls

        # Extraction from a playlist only returns 100 videos at a time
        # if self._extract_videos returns a continuation there are more
        # than 100 songs inside a playlist, so we need to add further requests
        # to gather all of them
        if continuation:
            load_more_url, headers, data = self._build_continuation_url(continuation)
        else:
            load_more_url, headers, data = None, None, None

        while load_more_url and headers and data:  # there is an url found
            logger.debug("load more url: %s", load_more_url)
            # requesting the next page of videos with the url generated from the
            # previous page, needs to be a post
            req = request.post(load_more_url, extra_headers=headers, data=data)
            # extract up to 100 songs from the page loaded
            # returns another continuation if more videos are available
            videos_urls, continuation = self._extract_videos(req)
            if until_watch_id:
                try:
                    trim_index = videos_urls.index(f"/watch?v={until_watch_id}")
                    yield videos_urls[:trim_index]
                    return
                except ValueError:
                    pass
            yield videos_urls

            if continuation:
                load_more_url, headers, data = self._build_continuation_url(
                    continuation
                )
            else:
                load_more_url, headers, data = None, None, None

    def _build_continuation_url(self, continuation: str) -> Tuple[str, dict, dict]:
        """Helper method to build the url and headers required to request
        the next page of videos

        :param str continuation: Continuation extracted from the json response
            of the last page
        :rtype: Tuple[str, dict, dict]
        :returns: Tuple of an url and required headers for the next http
            request
        """
        return (
            (
                # was changed to this format (and post requests)
                # between 2021.03.02 and 2021.03.03
                "https://www.youtube.com/youtubei/v1/browse?key="
                f"{self.yt_api_key}"
            ),
            {
                "X-YouTube-Client-Name": "1",
                "X-YouTube-Client-Version": "2.20200720.00.02",
            },
            # extra data required for post request
            {
                "continuation": continuation,
                "context": {
                    "client": {
                        "clientName": "WEB",
                        "clientVersion": "2.20200720.00.02"
                    }
                }
            }
        )

    @staticmethod
    def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]:
        """Extracts videos from a raw json page

        :param str raw_json: Input json extracted from the page or the last
            server response
        :rtype: Tuple[List[str], Optional[str]]
        :returns: Tuple containing a list of up to 100 video watch ids and
            a continuation token, if more videos are available
        """
        initial_data = json.loads(raw_json)
        try:
            # this is the json tree structure, if the json was extracted from
            # html
            section_contents = initial_data["contents"][
                "twoColumnBrowseResultsRenderer"][
                "tabs"][0]["tabRenderer"]["content"][
                "sectionListRenderer"]["contents"]
            try:
                # Playlist without submenus
                important_content = section_contents[
                    0]["itemSectionRenderer"][
                    "contents"][0]["playlistVideoListRenderer"]
            except (KeyError, IndexError, TypeError):
                # Playlist with submenus
                important_content = section_contents[
                    1]["itemSectionRenderer"][
                    "contents"][0]["playlistVideoListRenderer"]
            videos = important_content["contents"]
        except (KeyError, IndexError, TypeError):
            try:
                # this is the json tree structure, if the json was directly sent
                # by the server in a continuation response
                # no longer a list and no longer has the "response" key
                important_content = initial_data['onResponseReceivedActions'][0][
                    'appendContinuationItemsAction']['continuationItems']
                videos = important_content
            except (KeyError, IndexError, TypeError) as p:
                # None of the known layouts matched; log and give up.
                logger.info(p)
                return [], None

        try:
            # The trailing item may be a continuation marker rather than a
            # video; pop it off and keep its token for the next request.
            continuation = videos[-1]['continuationItemRenderer'][
                'continuationEndpoint'
            ]['continuationCommand']['token']
            videos = videos[:-1]
        except (KeyError, IndexError):
            # if there is an error, no continuation is available
            continuation = None

        # remove duplicates
        return (
            uniqueify(
                list(
                    # only extract the video ids from the video data
                    map(
                        lambda x: (
                            f"/watch?v="
                            f"{x['playlistVideoRenderer']['videoId']}"
                        ),
                        videos
                    )
                ),
            ),
            continuation,
        )

    def trimmed(self, video_id: str) -> Iterable[str]:
        """Retrieve a list of YouTube video URLs trimmed at the given video ID

        i.e. if the playlist has video IDs 1,2,3,4 calling trimmed(3) returns
        [1,2]
        :type video_id: str
            video ID to trim the returned list of playlist URLs at
        :rtype: Iterable[str]
        :returns:
            Generator of video URLs from the playlist trimmed at the given ID
        """
        for page in self._paginate(until_watch_id=video_id):
            yield from (self._video_url(watch_path) for watch_path in page)

    def url_generator(self):
        """Generator that yields video URLs.

        :Yields: Video URLs
        """
        for page in self._paginate():
            for video in page:
                yield self._video_url(video)

    @property  # type: ignore
    @cache
    def video_urls(self) -> DeferredGeneratorList:
        """Complete links of all the videos in playlist

        :rtype: List[str]
        :returns: List of video URLs
        """
        return DeferredGeneratorList(self.url_generator())

    def videos_generator(self):
        """Yield a YouTube object for each video URL in the playlist."""
        for url in self.video_urls:
            yield YouTube(url)

    @property
    def videos(self) -> Iterable[YouTube]:
        """Yields YouTube objects of videos in this playlist

        :rtype: List[YouTube]
        :returns: List of YouTube
        """
        return DeferredGeneratorList(self.videos_generator())

    def __getitem__(self, i: Union[slice, int]) -> Union[str, List[str]]:
        """Index or slice into the lazily built list of video URLs."""
        return self.video_urls[i]

    def __len__(self) -> int:
        """Return the number of video URLs in the playlist."""
        return len(self.video_urls)

    def __repr__(self) -> str:
        """Represent the playlist by its list of video URLs."""
        return f"{repr(self.video_urls)}"

    @property
    @cache
    def last_updated(self) -> Optional[date]:
        """Extract the date that the playlist was last updated.

        For some playlists, this will be a specific date, which is returned as a datetime
        object. For other playlists, this is an estimate such as "1 week ago". Due to the
        fact that this value is returned as a string, pytube does a best-effort parsing
        where possible, and returns the raw string where it is not possible.

        :return: Date of last playlist update where possible, else the string provided
        :rtype: datetime.date
        """
        last_updated_text = self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][
            'stats'][2]['runs'][1]['text']
        try:
            # Expected shape: "Mar 3, 2021" — month, day, year.
            date_components = last_updated_text.split()
            month = date_components[0]
            day = date_components[1].strip(',')
            year = date_components[2]
            return datetime.strptime(
                f"{month} {day:0>2} {year}", "%b %d %Y"
            ).date()
        except (IndexError, KeyError):
            # Not a parseable date (e.g. "1 week ago"); return the raw text.
            return last_updated_text

    @property
    @cache
    def title(self) -> Optional[str]:
        """Extract playlist title

        :return: playlist title (name)
        :rtype: Optional[str]
        """
        return self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][
            'title']['runs'][0]['text']

    @property
    def description(self) -> str:
        """Extract the playlist description text.

        :rtype: str
        """
        return self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][
            'description']['simpleText']

    @property
    def length(self):
        """Extract the number of videos in the playlist.

        :return: Playlist video count
        :rtype: int
        """
        count_text = self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][
            'stats'][0]['runs'][0]['text']
        count_text = count_text.replace(',','')
        return int(count_text)

    @property
    def views(self):
        """Extract view count for playlist.

        :return: Playlist view count
        :rtype: int
        """
        # "1,234,567 views"
        views_text = self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][
            'stats'][1]['simpleText']
        # "1,234,567"
        count_text = views_text.split()[0]
        # "1234567"
        count_text = count_text.replace(',', '')
        return int(count_text)

    @property
    def owner(self):
        """Extract the owner of the playlist.

        :return: Playlist owner name.
        :rtype: str
        """
        return self.sidebar_info[1]['playlistSidebarSecondaryInfoRenderer'][
            'videoOwner']['videoOwnerRenderer']['title']['runs'][0]['text']

    @property
    def owner_id(self):
        """Extract the channel_id of the owner of the playlist.

        :return: Playlist owner's channel ID.
        :rtype: str
        """
        return self.sidebar_info[1]['playlistSidebarSecondaryInfoRenderer'][
            'videoOwner']['videoOwnerRenderer']['title']['runs'][0][
            'navigationEndpoint']['browseEndpoint']['browseId']

    @property
    def owner_url(self):
        """Create the channel url of the owner of the playlist.

        :return: Playlist owner's channel url.
        :rtype: str
        """
        return f'https://www.youtube.com/channel/{self.owner_id}'

    @staticmethod
    def _video_url(watch_path: str):
        """Expand a '/watch?v=...' path into a full YouTube URL."""
        return f"https://www.youtube.com{watch_path}"
diff --git a/pytube/contrib/search.py b/pytube/contrib/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..96982d80ed6291b9a526fd7329b802e8360ef708
--- /dev/null
+++ b/pytube/contrib/search.py
@@ -0,0 +1,225 @@
+"""Module for interacting with YouTube search."""
+# Native python imports
+import logging
+
+# Local imports
+from pytube import YouTube
+from pytube.innertube import InnerTube
+
+
+logger = logging.getLogger(__name__)
+
+
class Search:
    """Run a YouTube search query and lazily page through its results."""

    def __init__(self, query):
        """Initialize Search object.

        :param str query:
            Search query provided by the user.
        """
        self.query = query
        self._innertube_client = InnerTube(client='WEB')

        # The first search, without a continuation, is structured differently
        # and contains completion suggestions, so we must store this separately
        self._initial_results = None

        self._results = None
        self._completion_suggestions = None

        # Used for keeping track of query continuations so that new results
        # are always returned when get_next_results() is called
        self._current_continuation = None

    @property
    def completion_suggestions(self):
        """Return query autocompletion suggestions for the query.

        :rtype: list
        :returns:
            A list of autocomplete suggestions provided by YouTube for the query.
        """
        if self._completion_suggestions:
            return self._completion_suggestions
        if self.results:
            self._completion_suggestions = self._initial_results['refinements']
        return self._completion_suggestions

    @property
    def results(self):
        """Return search results.

        On first call, will generate and return the first set of results.
        Additional results can be generated using ``.get_next_results()``.

        :rtype: list
        :returns:
            A list of YouTube objects.
        """
        if self._results:
            return self._results

        videos, continuation = self.fetch_and_parse()
        self._results = videos
        self._current_continuation = continuation
        return self._results

    def get_next_results(self):
        """Use the stored continuation string to fetch the next set of results.

        This method does not return the results, but instead updates the results property.

        :raises IndexError: if no continuation is available (no more pages).
        """
        if self._current_continuation:
            videos, continuation = self.fetch_and_parse(self._current_continuation)
            self._results.extend(videos)
            self._current_continuation = continuation
        else:
            raise IndexError

    def fetch_and_parse(self, continuation=None):
        """Fetch from the innertube API and parse the results.

        :param str continuation:
            Continuation string for fetching results.
        :rtype: tuple
        :returns:
            A tuple of a list of YouTube objects and a continuation string.
        """
        # Begin by executing the query and identifying the relevant sections
        # of the results
        raw_results = self.fetch_query(continuation)

        # Initial result is handled by try block, continuations by except block
        try:
            sections = raw_results['contents']['twoColumnSearchResultsRenderer'][
                'primaryContents']['sectionListRenderer']['contents']
        except KeyError:
            sections = raw_results['onResponseReceivedCommands'][0][
                'appendContinuationItemsAction']['continuationItems']
        item_renderer = None
        continuation_renderer = None
        for s in sections:
            if 'itemSectionRenderer' in s:
                item_renderer = s['itemSectionRenderer']
            if 'continuationItemRenderer' in s:
                continuation_renderer = s['continuationItemRenderer']

        # If the continuationItemRenderer doesn't exist, assume no further results
        if continuation_renderer:
            next_continuation = continuation_renderer['continuationEndpoint'][
                'continuationCommand']['token']
        else:
            next_continuation = None

        # If the itemSectionRenderer doesn't exist, assume no results.
        if item_renderer:
            videos = []
            raw_video_list = item_renderer['contents']
            for video_details in raw_video_list:
                # Skip over ads
                if video_details.get('searchPyvRenderer', {}).get('ads', None):
                    continue

                # Skip "recommended" type videos e.g. "people also watched" and "popular X"
                # that break up the search results
                if 'shelfRenderer' in video_details:
                    continue

                # Skip auto-generated "mix" playlist results
                if 'radioRenderer' in video_details:
                    continue

                # Skip playlist results
                if 'playlistRenderer' in video_details:
                    continue

                # Skip channel results
                if 'channelRenderer' in video_details:
                    continue

                # Skip 'people also searched for' results
                if 'horizontalCardListRenderer' in video_details:
                    continue

                # Can't seem to reproduce, probably related to typo fix suggestions
                if 'didYouMeanRenderer' in video_details:
                    continue

                # Seems to be the renderer used for the image shown on a no results page
                if 'backgroundPromoRenderer' in video_details:
                    continue

                if 'videoRenderer' not in video_details:
                    # Logger.warn is a deprecated alias; warning() is the
                    # supported spelling.
                    logger.warning('Unexpected renderer encountered.')
                    logger.warning(f'Renderer name: {video_details.keys()}')
                    logger.warning(f'Search term: {self.query}')
                    logger.warning(
                        'Please open an issue at '
                        'https://github.com/pytube/pytube/issues '
                        'and provide this log output.'
                    )
                    continue

                # Extract relevant video information from the details.
                # Some of this can be used to pre-populate attributes of the
                # YouTube object.
                vid_renderer = video_details['videoRenderer']
                vid_id = vid_renderer['videoId']
                vid_url = f'https://www.youtube.com/watch?v={vid_id}'
                vid_title = vid_renderer['title']['runs'][0]['text']
                vid_channel_name = vid_renderer['ownerText']['runs'][0]['text']
                vid_channel_uri = vid_renderer['ownerText']['runs'][0][
                    'navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
                # Livestreams have "runs", non-livestreams have "simpleText",
                # and scheduled releases do not have 'viewCountText'
                if 'viewCountText' in vid_renderer:
                    if 'runs' in vid_renderer['viewCountText']:
                        vid_view_count_text = vid_renderer['viewCountText']['runs'][0]['text']
                    else:
                        vid_view_count_text = vid_renderer['viewCountText']['simpleText']
                    # Strip ' views' text, then remove commas
                    stripped_text = vid_view_count_text.split()[0].replace(',','')
                    if stripped_text == 'No':
                        vid_view_count = 0
                    else:
                        vid_view_count = int(stripped_text)
                else:
                    vid_view_count = 0
                if 'lengthText' in vid_renderer:
                    vid_length = vid_renderer['lengthText']['simpleText']
                else:
                    vid_length = None

                vid_metadata = {
                    'id': vid_id,
                    'url': vid_url,
                    'title': vid_title,
                    'channel_name': vid_channel_name,
                    'channel_url': vid_channel_uri,
                    'view_count': vid_view_count,
                    'length': vid_length
                }

                # Construct YouTube object from metadata and append to results
                vid = YouTube(vid_metadata['url'])
                vid.author = vid_metadata['channel_name']
                vid.title = vid_metadata['title']
                videos.append(vid)
        else:
            videos = None

        return videos, next_continuation

    def fetch_query(self, continuation=None):
        """Fetch raw results from the innertube API.

        :param str continuation:
            Continuation string for fetching results.
        :rtype: dict
        :returns:
            The raw json object returned by the innertube API.
        """
        query_results = self._innertube_client.search(self.query, continuation)
        if not self._initial_results:
            self._initial_results = query_results
        return query_results  # noqa:R504
diff --git a/pytube/exceptions.py b/pytube/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec44d2a12f7f82cbafbbfd717efae7f0644b7f33
--- /dev/null
+++ b/pytube/exceptions.py
@@ -0,0 +1,145 @@
+"""Library specific exception definitions."""
+from typing import Pattern, Union
+
+
class PytubeError(Exception):
    """Base pytube exception that all others inherit.

    This is done to not pollute the built-in exceptions, which *could* result
    in unintended errors being unexpectedly and incorrectly handled within
    implementers' code. Catching ``PytubeError`` catches every error raised
    by this library.
    """
+
+
class MaxRetriesExceeded(PytubeError):
    """Maximum number of retries exceeded."""
    # NOTE(review): presumably raised by the request layer after repeated
    # failures — confirm against the raising call site.
+
+
class HTMLParseError(PytubeError):
    """HTML could not be parsed."""
+
+
class ExtractError(PytubeError):
    """Data extraction based exception."""
+
+
class RegexMatchError(ExtractError):
    """Regex pattern did not return any matches."""

    def __init__(self, caller: str, pattern: Union[str, Pattern]):
        """
        :param str caller:
            Calling function
        :param str pattern:
            Pattern that failed to match
        """
        # Keep the failing pattern and its caller around for programmatic
        # inspection; the formatted message is what end users see.
        self.caller = caller
        self.pattern = pattern
        message = f"{caller}: could not find match for {pattern}"
        super().__init__(message)
+
+
class VideoUnavailable(PytubeError):
    """Base video unavailable error.

    Subclasses override :attr:`error_string` to customise the message;
    ``__init__`` here builds the exception text from that property, so
    subclasses do not need their own ``__init__``.
    """
    def __init__(self, video_id: str):
        """
        :param str video_id:
            A YouTube video identifier.
        """
        # Must be set before super().__init__, which reads error_string.
        self.video_id = video_id
        super().__init__(self.error_string)

    @property
    def error_string(self):
        # Human-readable message; overridden by subclasses.
        return f'{self.video_id} is unavailable'
+
+
class AgeRestrictedError(VideoUnavailable):
    """Video is age restricted, and cannot be accessed without OAuth.

    The redundant ``__init__`` override was removed: it duplicated
    ``VideoUnavailable.__init__`` verbatim (store ``video_id``, then build
    the message from the overridden ``error_string`` property).
    """

    @property
    def error_string(self):
        return f"{self.video_id} is age restricted, and can't be accessed without logging in."
+
+
class LiveStreamError(VideoUnavailable):
    """Video is a live stream.

    Redundant ``__init__`` removed; ``VideoUnavailable.__init__`` already
    stores ``video_id`` and formats the message via ``error_string``.
    """

    @property
    def error_string(self):
        return f'{self.video_id} is streaming live and cannot be loaded'
+
+
class VideoPrivate(VideoUnavailable):
    """Video is private and cannot be accessed.

    Redundant ``__init__`` removed; the base class handles construction.
    """

    @property
    def error_string(self):
        return f'{self.video_id} is a private video'
+
+
class RecordingUnavailable(VideoUnavailable):
    """Live stream recording is not available for this video.

    Redundant ``__init__`` removed; the base class handles construction.
    """

    @property
    def error_string(self):
        return f'{self.video_id} does not have a live stream recording available'
+
+
class MembersOnly(VideoUnavailable):
    """Video is members-only.

    YouTube has special videos that are only viewable to users who have
    subscribed to a content creator.
    ref: https://support.google.com/youtube/answer/7544492?hl=en

    Redundant ``__init__`` removed; the base class handles construction.
    """

    @property
    def error_string(self):
        return f'{self.video_id} is a members-only video'
+
+
class VideoRegionBlocked(VideoUnavailable):
    """Video is blocked in the caller's region.

    Redundant ``__init__`` removed; the base class handles construction.
    """

    @property
    def error_string(self):
        return f'{self.video_id} is not available in your region'
diff --git a/pytube/extract.py b/pytube/extract.py
new file mode 100644
index 0000000000000000000000000000000000000000..d08321408694869020527423cd4d2812f43be58b
--- /dev/null
+++ b/pytube/extract.py
@@ -0,0 +1,579 @@
+"""This module contains all non-cipher related data extraction logic."""
+import logging
+import urllib.parse
+import re
+from collections import OrderedDict
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import parse_qs, quote, urlencode, urlparse
+
+from pytube.cipher import Cipher
+from pytube.exceptions import HTMLParseError, LiveStreamError, RegexMatchError
+from pytube.helpers import regex_search
+from pytube.metadata import YouTubeMetadata
+from pytube.parser import parse_for_object, parse_for_all_objects
+
+
+logger = logging.getLogger(__name__)
+
+
def publish_date(watch_html: str) -> Optional[datetime]:
    """Extract the date a video was published.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: datetime.datetime or None
    :returns:
        Publish date of the video as a ``datetime`` (parsed from the
        ``itemprop="datePublished"`` meta tag), or ``None`` when the tag
        is absent.
    """
    try:
        result = regex_search(
            r"(?<=itemprop=\"datePublished\" content=\")\d{4}-\d{2}-\d{2}",
            watch_html, group=0
        )
    except RegexMatchError:
        return None
    return datetime.strptime(result, '%Y-%m-%d')
+
+
def recording_available(watch_html):
    """Check if live stream recording is available.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        Whether or not a recording of the live stream is available.
    """
    unavailable_strings = [
        'This live stream recording is not available.'
    ]
    # Any marker present means the recording cannot be fetched.
    return not any(marker in watch_html for marker in unavailable_strings)
+
+
def is_private(watch_html):
    """Check if content is private.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        Whether or not the content is private.
    """
    private_strings = [
        "This is a private video. Please sign in to verify that you may see it.",
        "\"simpleText\":\"Private video\"",
        "This video is private."
    ]
    # The page is private if any known marker string appears in the html.
    return any(marker in watch_html for marker in private_strings)
+
+
def is_age_restricted(watch_html: str) -> bool:
    """Check if content is age restricted.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        Whether or not the content is age restricted.
    """
    # The original routed a *literal* string through the regex helper and
    # caught RegexMatchError just to produce a boolean. A plain substring
    # test is equivalent (the pattern contains no metacharacters) and
    # avoids the exception round-trip.
    return "og:restrictions:age" in watch_html
+
+
def playability_status(watch_html: str) -> Tuple[Optional[str], List[Optional[str]]]:
    """Return the playability status and status explanation of a video.

    For example, a video may have a status of LOGIN_REQUIRED, and an explanation
    of "This is a private video. Please sign in to verify that you may see it."

    This explanation is what gets incorporated into the media player overlay.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: tuple
    :returns:
        Playability status and a list of reason messages.
    """
    player_response = initial_player_response(watch_html)
    status_dict = player_response.get('playabilityStatus', {})
    if 'liveStreamability' in status_dict:
        # Normalized to a list: every other branch returns the messages as
        # a list, and the old bare string made callers iterate characters.
        return 'LIVE_STREAM', ['Video is a live stream.']
    if 'status' in status_dict:
        if 'reason' in status_dict:
            return status_dict['status'], [status_dict['reason']]
        if 'messages' in status_dict:
            return status_dict['status'], status_dict['messages']
    return None, [None]
+
+
def video_id(url: str) -> str:
    """Extract the ``video_id`` from a YouTube url.

    This function supports the following patterns:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    :param str url:
        A YouTube url containing a video id.
    :rtype: str
    :returns:
        YouTube video id.
    """
    # An id is exactly 11 url-safe base64 characters after "v=" or "/".
    id_pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
    return regex_search(id_pattern, url, group=1)
+
+
def playlist_id(url: str) -> str:
    """Extract the ``playlist_id`` from a YouTube url.

    This function supports the following patterns:

    - :samp:`https://youtube.com/playlist?list={playlist_id}`
    - :samp:`https://youtube.com/watch?v={video_id}&list={playlist_id}`

    :param str url:
        A YouTube url containing a playlist id.
    :rtype: str
    :returns:
        YouTube playlist id.
    """
    query_string = urllib.parse.urlparse(url).query
    # parse_qs yields a list per key; the first "list" value is the id.
    return parse_qs(query_string)['list'][0]
+
+
def channel_name(url: str) -> str:
    """Extract the ``channel_name`` or ``channel_id`` from a YouTube url.

    This function supports the following patterns:

    - :samp:`https://youtube.com/c/{channel_name}/*`
    - :samp:`https://youtube.com/channel/{channel_id}/*
    - :samp:`https://youtube.com/u/{channel_name}/*`
    - :samp:`https://youtube.com/user/{channel_id}/*

    :param str url:
        A YouTube url containing a channel name.
    :rtype: str
    :returns:
        YouTube channel name.
    """
    patterns = [
        r"(?:\/(c)\/([%\d\w_\-]+)(\/.*)?)",
        r"(?:\/(channel)\/([%\w\d_\-]+)(\/.*)?)",
        r"(?:\/(u)\/([%\d\w_\-]+)(\/.*)?)",
        r"(?:\/(user)\/([%\w\d_\-]+)(\/.*)?)"
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match is None:
            continue
        logger.debug("finished regex search, matched: %s", pattern)
        # Group 1 is the uri style (c/channel/u/user); group 2 the name/id.
        return f'/{match.group(1)}/{match.group(2)}'

    raise RegexMatchError(
        caller="channel_name", pattern="patterns"
    )
+
+
def video_info_url(video_id: str, watch_url: str) -> str:
    """Construct the video_info url.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    # OrderedDict keeps the query-string parameter order stable.
    params = OrderedDict([
        ("video_id", video_id),
        ("ps", "default"),
        ("eurl", quote(watch_url)),
        ("hl", "en_US"),
        ("html5", "1"),
        ("c", "TVHTML5"),
        ("cver", "7.20201028"),
    ])
    return _video_info_url(params)
+
+
def video_info_url_age_restricted(video_id: str, embed_html: str) -> str:
    """Construct the video_info url for age restricted videos.

    :param str video_id:
        A YouTube video identifier.
    :param str embed_html:
        The html contents of the embed page (for age restricted videos).
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.
    """
    # The signature timestamp is optional; fall back to an empty value.
    try:
        sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
    except RegexMatchError:
        sts = ""
    eurl = f"https://youtube.googleapis.com/v/{video_id}"
    # OrderedDict keeps the query-string parameter order stable.
    params = OrderedDict([
        ("video_id", video_id),
        ("eurl", eurl),
        ("sts", sts),
        ("html5", "1"),
        ("c", "TVHTML5"),
        ("cver", "7.20201028"),
    ])
    return _video_info_url(params)
+
+
+def _video_info_url(params: OrderedDict) -> str:
+ return "https://www.youtube.com/get_video_info?" + urlencode(params)
+
+
def js_url(html: str) -> str:
    """Get the base JavaScript url.

    Construct the base JavaScript url, which contains the decipher
    "transforms".

    :param str html:
        The html contents of the watch page.
    :rtype: str
    :returns:
        Absolute url of YouTube's base.js asset.
    """
    # Prefer the path advertised in the player config; fall back to
    # scraping the base.js path directly from the html.
    try:
        base_js_path = get_ytplayer_config(html)['assets']['js']
    except (KeyError, RegexMatchError):
        base_js_path = get_ytplayer_js(html)
    return "https://youtube.com" + base_js_path
+
+
def mime_type_codec(mime_type_codec: str) -> Tuple[str, List[str]]:
    """Parse the type data.

    Breaks up the data in the ``type`` key of the manifest, which contains the
    mime type and codecs serialized together, and splits them into separate
    elements.

    **Example**:

    mime_type_codec('audio/webm; codecs="opus"') -> ('audio/webm', ['opus'])

    :param str mime_type_codec:
        String containing mime type and codecs.
    :rtype: tuple
    :returns:
        The mime type and a list of codecs.
    :raises RegexMatchError:
        If the string does not look like ``type; codecs="..."``.
    """
    pattern = r"(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\""
    match = re.search(pattern, mime_type_codec)
    if match is None:
        raise RegexMatchError(caller="mime_type_codec", pattern=pattern)
    mime_type, codec_str = match.groups()
    codecs = [codec.strip() for codec in codec_str.split(",")]
    return mime_type, codecs
+
+
def get_ytplayer_js(html: str) -> Any:
    """Get the YouTube player base JavaScript path.

    :param str html:
        The html contents of the watch page.
    :rtype: str
    :returns:
        Path to YouTube's base.js file.
    :raises RegexMatchError:
        If no base.js path can be located in the html.
    """
    js_url_patterns = [
        r"(/s/player/[\w\d]+/[\w\d_/.]+/base\.js)"
    ]
    for pattern in js_url_patterns:
        match = re.search(pattern, html)
        if match is None:
            continue
        logger.debug("finished regex search, matched: %s", pattern)
        return match.group(1)

    raise RegexMatchError(
        caller="get_ytplayer_js", pattern="js_url_patterns"
    )
+
+
def get_ytplayer_config(html: str) -> Any:
    """Get the YouTube player configuration data from the watch html.

    Extract the ``ytplayer_config``, which is json data embedded within the
    watch html and serves as the primary source of obtaining the stream
    manifest data.

    :param str html:
        The html contents of the watch page.
    :rtype: str
    :returns:
        Substring of the html containing the encoded manifest data.
    :raises RegexMatchError:
        If none of the known config patterns can be parsed.
    """
    logger.debug("finding initial function name")

    # First, the direct assignment forms of the player config.
    config_patterns = [
        r"ytplayer\.config\s*=\s*",
        r"ytInitialPlayerResponse\s*=\s*"
    ]
    for pattern in config_patterns:
        try:
            return parse_for_object(html, pattern)
        except HTMLParseError as e:
            logger.debug(f'Pattern failed: {pattern}')
            logger.debug(e)

    # setConfig() needs to be handled a little differently: the whole
    # argument to setConfig() is parsed as json and PLAYER_CONFIG is
    # looked up inside it.
    setconfig_patterns = [
        r"yt\.setConfig\(.*['\"]PLAYER_CONFIG['\"]:\s*"
    ]
    for pattern in setconfig_patterns:
        try:
            return parse_for_object(html, pattern)
        except HTMLParseError:
            pass

    raise RegexMatchError(
        caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
    )
+
+
def get_ytcfg(html: str) -> dict:
    """Get the entirety of the ytcfg object.

    This is built over multiple pieces, so we have to find all matches and
    combine the dicts together.

    :param str html:
        The html contents of the watch page.
    :rtype: dict
    :returns:
        The combined ytcfg object. (Return annotation corrected from
        ``str``; the function builds and returns a dict.)
    :raises RegexMatchError:
        If no ytcfg pieces could be parsed out of the html.
    """
    ytcfg: dict = {}
    ytcfg_patterns = [
        r"ytcfg\s=\s",
        r"ytcfg\.set\("
    ]
    for pattern in ytcfg_patterns:
        # Try each pattern consecutively and try to build a cohesive object
        try:
            found_objects = parse_for_all_objects(html, pattern)
        except HTMLParseError:
            continue
        for obj in found_objects:
            ytcfg.update(obj)

    if ytcfg:
        return ytcfg

    # Typo fixed in the diagnostic pattern name ("ytcfg_pattenrs").
    raise RegexMatchError(
        caller="get_ytcfg", pattern="ytcfg_patterns"
    )
+
+
def apply_signature(stream_manifest: Dict, vid_info: Dict, js: str) -> None:
    """Apply the decrypted signature to the stream manifest.

    :param dict stream_manifest:
        Details of the media streams available.
    :param dict vid_info:
        Parsed video info; used to detect live streams.
    :param str js:
        The contents of the base.js asset file.
    """
    cipher = Cipher(js=js)

    for i, stream in enumerate(stream_manifest):
        try:
            url: str = stream["url"]
        except KeyError:
            live_stream = (
                vid_info.get("playabilityStatus", {},)
                .get("liveStreamability")
            )
            if live_stream:
                raise LiveStreamError("UNKNOWN")
            # BUG FIX: execution previously fell through here with ``url``
            # unbound, crashing below with a confusing UnboundLocalError.
            # Re-raise the original KeyError so the failure is explicit.
            raise
        # 403 Forbidden fix.
        if "signature" in url or (
            "s" not in stream and ("&sig=" in url or "&lsig=" in url)
        ):
            # For certain videos, YouTube will just provide them pre-signed, in
            # which case there's no real magic to download them and we can skip
            # the whole signature descrambling entirely.
            logger.debug("signature found, skip decipher")
            continue

        signature = cipher.get_signature(ciphered_signature=stream["s"])

        logger.debug(
            "finished descrambling signature for itag=%s", stream["itag"]
        )
        parsed_url = urlparse(url)

        # Convert query params off url to dict (reuse parsed_url instead of
        # parsing the url a second time).
        query_params = parse_qs(parsed_url.query)
        query_params = {
            k: v[0] for k, v in query_params.items()
        }
        query_params['sig'] = signature
        if 'ratebypass' not in query_params.keys():
            # Cipher n to get the updated value

            initial_n = list(query_params['n'])
            new_n = cipher.calculate_n(initial_n)
            query_params['n'] = new_n

        url = f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}?{urlencode(query_params)}'  # noqa:E501

        # 403 forbidden fix
        stream_manifest[i]["url"] = url
+
+
def apply_descrambler(stream_data: Dict) -> Optional[List[Dict]]:
    """Flatten YouTube's stream data into a single list of stream dicts.

    Merges ``formats`` and ``adaptiveFormats`` into one list and, for
    streams that only carry a ``signatureCipher``, extracts the ``url``
    and ``s`` fields out of it. Also tags each stream with ``is_otf``.

    (Docstring and return annotation corrected: the old text described an
    in-place, two-argument API and declared ``-> None``, but the function
    returns the merged list — or ``None`` when ``stream_data`` already has
    a top-level ``url``.)

    :param dict stream_data:
        Dictionary containing the raw stream data.
    :rtype: list or None
    :returns:
        List of flattened stream dicts, or ``None`` if ``stream_data``
        already contains a pre-signed ``url``.
    """
    if 'url' in stream_data:
        return None

    # Merge formats and adaptiveFormats into a single list
    formats = []
    if 'formats' in stream_data.keys():
        formats.extend(stream_data['formats'])
    if 'adaptiveFormats' in stream_data.keys():
        formats.extend(stream_data['adaptiveFormats'])

    # Extract url and s from signatureCiphers as necessary
    for data in formats:
        if 'url' not in data:
            if 'signatureCipher' in data:
                cipher_url = parse_qs(data['signatureCipher'])
                data['url'] = cipher_url['url'][0]
                data['s'] = cipher_url['s'][0]
        data['is_otf'] = data.get('type') == 'FORMAT_STREAM_TYPE_OTF'

    logger.debug("applying descrambler")
    return formats
+
+
def initial_data(watch_html: str) -> dict:
    """Extract the ytInitialData json from the watch_html page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    @param watch_html: Html of the watch page
    @return: the parsed ytInitialData object (annotation corrected from
        ``str``; ``parse_for_object`` yields a parsed object)
    @raise RegexMatchError: if no ytInitialData assignment is found
    """
    patterns = [
        r"window\[['\"]ytInitialData['\"]]\s*=\s*",
        r"ytInitialData\s*=\s*"
    ]
    for pattern in patterns:
        try:
            return parse_for_object(watch_html, pattern)
        except HTMLParseError:
            pass

    raise RegexMatchError(caller='initial_data', pattern='initial_data_pattern')
+
+
def initial_player_response(watch_html: str) -> dict:
    """Extract the ytInitialPlayerResponse json from the watch_html page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    @param watch_html: Html of the watch page
    @return: the parsed ytInitialPlayerResponse object (annotation
        corrected from ``str``; callers index into it as a dict)
    @raise RegexMatchError: if no ytInitialPlayerResponse assignment is found
    """
    patterns = [
        r"window\[['\"]ytInitialPlayerResponse['\"]]\s*=\s*",
        r"ytInitialPlayerResponse\s*=\s*"
    ]
    for pattern in patterns:
        try:
            return parse_for_object(watch_html, pattern)
        except HTMLParseError:
            pass

    raise RegexMatchError(
        caller='initial_player_response',
        pattern='initial_player_response_pattern'
    )
+
+
def metadata(initial_data) -> Optional[YouTubeMetadata]:
    """Get the informational metadata for the video.

    e.g.:
    [
        {
            'Song': '강남스타일(Gangnam Style)',
            'Artist': 'PSY',
            'Album': 'PSY SIX RULES Pt.1',
            'Licensed to YouTube by': 'YG Entertainment Inc. [...]'
        }
    ]

    :rtype: YouTubeMetadata
    """
    try:
        raw_rows: List = initial_data["contents"]["twoColumnWatchNextResults"][
            "results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"][
            "metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
    except (KeyError, IndexError):
        # If there's an exception accessing this data, it probably doesn't exist.
        return YouTubeMetadata([])

    # Rows hold either "metadataRowRenderer" or "metadataRowHeaderRenderer";
    # only the former carries the data we want, so filter and unwrap it
    # in a single pass.
    renderer_rows = [
        row["metadataRowRenderer"]
        for row in raw_rows
        if "metadataRowRenderer" in row
    ]
    return YouTubeMetadata(renderer_rows)
diff --git a/pytube/helpers.py b/pytube/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cf02eb413b08aeca0ffe33112f30b51705119cb
--- /dev/null
+++ b/pytube/helpers.py
@@ -0,0 +1,335 @@
+"""Various helper functions implemented by pytube."""
+import functools
+import gzip
+import json
+import logging
+import os
+import re
+import warnings
+from typing import Any, Callable, Dict, List, Optional, TypeVar
+from urllib import request
+
+from pytube.exceptions import RegexMatchError
+
+logger = logging.getLogger(__name__)
+
+
class DeferredGeneratorList:
    """A wrapper class for deferring list generation.

    Pytube has some continuation generators that create web calls, which means
    that any time a full list is requested, all of those web calls must be
    made at once, which could lead to slowdowns. This will allow individual
    elements to be queried, so that slowdowns only happen as necessary. For
    example, you can iterate over elements in the list without accessing them
    all simultaneously. This should allow for speed improvements for playlist
    and channel interactions.
    """
    def __init__(self, generator):
        """Construct a :class:`DeferredGeneratorList <DeferredGeneratorList>`.

        :param generator generator:
            The deferrable generator to create a wrapper for.
        """
        self.gen = generator
        self._elements = []
        # BUG FIX: __next__ read self.iter_index without it ever being
        # initialized, so next(obj) raised AttributeError on first use.
        self.iter_index = 0

    def __eq__(self, other):
        """We want to mimic list behavior for comparison."""
        return list(self) == other

    def __getitem__(self, key) -> Any:
        """Only generate items as they're asked for."""
        # We only allow querying with indexes.
        if not isinstance(key, (int, slice)):
            raise TypeError('Key must be either a slice or int.')

        # Convert int keys to slice
        key_slice = key
        if isinstance(key, int):
            key_slice = slice(key, key + 1, 1)

        # BUG FIX: an open-ended slice (e.g. obj[1:]) has stop=None, which
        # previously crashed the `<` comparison below with a TypeError.
        # Materialize everything in that case.
        if key_slice.stop is None:
            self.generate_all()
        else:
            # Generate all elements up to the final item
            while len(self._elements) < key_slice.stop:
                try:
                    next_item = next(self.gen)
                except StopIteration:
                    # If we can't find enough elements for the slice, raise an IndexError
                    raise IndexError
                else:
                    self._elements.append(next_item)

        return self._elements[key]

    def __iter__(self):
        """Custom iterator for dynamically generated list."""
        iter_index = 0
        while True:
            try:
                curr_item = self[iter_index]
            except IndexError:
                return
            else:
                yield curr_item
                iter_index += 1

    def __next__(self) -> Any:
        """Fetch next element in iterator."""
        try:
            curr_element = self[self.iter_index]
        except IndexError:
            raise StopIteration
        self.iter_index += 1
        return curr_element  # noqa:R504

    def __len__(self) -> int:
        """Return length of list of all items."""
        self.generate_all()
        return len(self._elements)

    def __repr__(self) -> str:
        """String representation of all items."""
        self.generate_all()
        return str(self._elements)

    def __reversed__(self):
        self.generate_all()
        return self._elements[::-1]

    def generate_all(self):
        """Generate all items."""
        while True:
            try:
                next_item = next(self.gen)
            except StopIteration:
                break
            else:
                self._elements.append(next_item)
+
+
def regex_search(pattern: str, string: str, group: int) -> str:
    """Shortcut method to search a string for a given pattern.

    :param str pattern:
        A regular expression pattern.
    :param str string:
        A target string to search.
    :param int group:
        Index of group to return.
    :rtype:
        str or tuple
    :returns:
        Substring pattern matches.
    :raises RegexMatchError:
        If the pattern does not match anywhere in the string.
    """
    match = re.compile(pattern).search(string)
    if match is None:
        raise RegexMatchError(caller="regex_search", pattern=pattern)

    logger.debug("matched regex search: %s", pattern)
    return match.group(group)
+
+
def safe_filename(s: str, max_length: int = 255) -> str:
    """Sanitize a string making it safe to use as a filename.

    This function was based off the limitations outlined here:
    https://en.wikipedia.org/wiki/Filename.

    :param str s:
        A string to make safe for use as a file name.
    :param int max_length:
        The maximum filename character length.
    :rtype: str
    :returns:
        A sanitized string.
    """
    # Control characters chr(0)..chr(30) are stripped.
    # NOTE(review): the NTFS restriction covers 0x00-0x1F (0-31) but this
    # range intentionally mirrors the original behavior and stops at 30.
    ntfs_characters = [chr(i) for i in range(0, 31)]
    # Disallowed punctuation (the duplicate '"' entry was removed; it did
    # not change matching behavior).
    characters = [
        r'"',
        r"\#",
        r"\$",
        r"\%",
        r"'",
        r"\*",
        r"\,",
        r"\.",
        r"\/",
        r"\:",
        r"\;",
        r"\<",
        r"\>",
        r"\?",
        r"\\",
        r"\^",
        r"\|",
        r"\~",
        r"\\\\",
    ]
    pattern = "|".join(ntfs_characters + characters)
    regex = re.compile(pattern, re.UNICODE)
    filename = regex.sub("", s)
    # Dead code removed: `.rsplit(" ", 0)[0]` with maxsplit=0 performed no
    # split and always returned the whole string.
    return filename[:max_length]
+
+
def setup_logger(level: int = logging.ERROR, log_filename: Optional[str] = None) -> None:
    """Create a configured instance of logger.

    :param int level:
        Describe the severity level of the logs to handle.
    :param str log_filename:
        Optional file path; when given, logs are also written there.
    """
    formatter = logging.Formatter(
        "[%(asctime)s] %(levelname)s in %(module)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    # https://github.com/pytube/pytube/issues/163
    pytube_logger = logging.getLogger("pytube")
    pytube_logger.setLevel(level)

    # Always log to the console; optionally mirror to a file.
    handlers = [logging.StreamHandler()]
    if log_filename is not None:
        handlers.append(logging.FileHandler(log_filename))
    for handler in handlers:
        handler.setFormatter(formatter)
        pytube_logger.addHandler(handler)
+
+
GenericType = TypeVar("GenericType")


def cache(func: Callable[..., GenericType]) -> GenericType:
    """mypy compatible annotation wrapper for lru_cache."""
    # lru_cache keeps its default maxsize (128); the cast lets call sites
    # retain the wrapped callable's own type.
    memoized = functools.lru_cache()(func)
    return memoized  # type: ignore
+
+
def deprecated(reason: str) -> Callable:
    """
    This is a decorator which can be used to mark functions
    as deprecated. It will result in a warning being emitted
    when the function is used.
    """

    def decorator(func):
        message = "Call to deprecated function {name} ({reason})."

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Temporarily force the warning to fire even if it was already
            # shown, then restore the default once-per-location behavior.
            warnings.simplefilter("always", DeprecationWarning)
            warnings.warn(
                message.format(name=func.__name__, reason=reason),
                category=DeprecationWarning,
                stacklevel=2,
            )
            warnings.simplefilter("default", DeprecationWarning)
            return func(*args, **kwargs)

        return wrapper

    return decorator
+
+
def target_directory(output_path: Optional[str] = None) -> str:
    """
    Function for determining target directory of a download.
    Returns an absolute path (if relative one given) or the current
    path (if none given). Makes directory if it does not exist.

    :type output_path: str
    :rtype: str
    :returns:
        An absolute directory path as a string.
    """
    if not output_path:
        resolved = os.getcwd()
    elif not os.path.isabs(output_path):
        # Anchor relative paths at the current working directory.
        resolved = os.path.join(os.getcwd(), output_path)
    else:
        resolved = output_path
    os.makedirs(resolved, exist_ok=True)
    return resolved
+
+
def install_proxy(proxy_handler: Dict[str, str]) -> None:
    """Install a process-wide proxy for all subsequent urllib requests.

    :param dict proxy_handler:
        Mapping of protocol name to proxy url, e.g. ``{"http": "..."}``.
    """
    opener = request.build_opener(request.ProxyHandler(proxy_handler))
    request.install_opener(opener)
+
+
def uniqueify(duped_list: List) -> List:
    """Remove duplicate items from a list, while maintaining list order.

    :param List duped_list
        List to remove duplicates from

    :return List result
        De-duplicated list
    """
    # dict preserves insertion order (Python 3.7+) and its keys are unique,
    # so this is an order-preserving de-duplication in one pass. Items must
    # be hashable, as in the original membership-dict implementation.
    return list(dict.fromkeys(duped_list))
+
+
def generate_all_html_json_mocks():
    """Regenerate the video mock json files for all current test videos.

    This should automatically output to the test/mocks directory.
    """
    # Ids of the videos the test-suite mocks are built from.
    for test_video_id in (
        '2lAe1cqCOXo',
        '5YceQ8YqYMc',
        'irauhITDrsE',
        'm8uHb5jIGN8',
        'QRS8MkLhQmM',
        'WXxV9g7lsFE',
    ):
        create_mock_html_json(test_video_id)
+
+
def create_mock_html_json(vid_id) -> Dict[str, Any]:
    """Generate a json.gz file with sample html responses.

    Fetches the watch page, embed page, js asset and video info for the
    given video and stores them gzipped under ``tests/mocks``.

    :param str vid_id
        YouTube video id

    :return dict data
        Dict used to generate the json.gz file
    """
    # Imported here to avoid a circular import at module load time.
    from pytube import YouTube
    gzip_filename = f'yt-video-{vid_id}-html.json.gz'

    # Get the pytube directory in order to navigate to /tests/mocks
    pytube_dir_path = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            os.path.pardir
        )
    )
    pytube_mocks_path = os.path.join(pytube_dir_path, 'tests', 'mocks')
    gzip_filepath = os.path.join(pytube_mocks_path, gzip_filename)

    yt = YouTube(f'https://www.youtube.com/watch?v={vid_id}')
    html_data = {
        'url': yt.watch_url,
        'js': yt.js,
        'embed_html': yt.embed_html,
        'watch_html': yt.watch_html,
        'vid_info': yt.vid_info
    }

    # Typo fixed in the log message ("Outputing").
    logger.info(f'Outputting json.gz file to {gzip_filepath}')
    with gzip.open(gzip_filepath, 'wb') as f:
        f.write(json.dumps(html_data).encode('utf-8'))

    return html_data
diff --git a/pytube/innertube.py b/pytube/innertube.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5d940a05d2618a617b233f8f966da25c1b83699
--- /dev/null
+++ b/pytube/innertube.py
@@ -0,0 +1,359 @@
+"""This module is designed to interact with the innertube API.
+
+This module is NOT intended to be used directly by end users, as each of the
+interfaces returns raw results. These should instead be parsed to extract
+the useful information for the end user.
+"""
+# Native python imports
+import json
+import os
+import pathlib
+import time
+from urllib import parse
+
+# Local imports
+from pytube import request
+
# YouTube on TV client secrets
# NOTE(review): these are the publicly-known OAuth device-flow credentials
# for the "YouTube on TV" client, not secrets belonging to this project.
_client_id = '861556708454-d6dlm3lh05idd8npek18k6be8ba3oc68.apps.googleusercontent.com'
_client_secret = 'SboVhoG9s0rNafixCSGGKXAT'

# Extracted API keys -- unclear what these are linked to.
_api_keys = [
    'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
    'AIzaSyCtkvNIR1HCEwzsqK6JuE6KqpyjusIRI30',
    'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w',
    'AIzaSyC8UYZpvA2eknNex0Pjid0_eTLJoDu6los',
    'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw',
    'AIzaSyDHQ9ipnphqTzDqZsbtd8_Ru4_kiKVQe2k'
]

# Per-client request configuration: 'context' is sent in every innertube
# request body, and 'api_key' is passed as the 'key' query parameter.
_default_clients = {
    'WEB': {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20200720.00.02'
            }
        },
        'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
    },
    'ANDROID': {
        'context': {
            'client': {
                'clientName': 'ANDROID',
                'clientVersion': '16.20'
            }
        },
        'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
    },
    'WEB_EMBED': {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20210721.00.00',
                'clientScreen': 'EMBED'
            }
        },
        'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
    },
    'ANDROID_EMBED': {
        'context': {
            'client': {
                'clientName': 'ANDROID',
                'clientVersion': '16.20',
                'clientScreen': 'EMBED'
            }
        },
        'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
    }
}
# Presumably a token lifetime in seconds (30 min); not referenced in this
# module's visible code -- TODO confirm where it is used.
_token_timeout = 1800
# OAuth token cache lives in a __cache__ directory next to this module.
_cache_dir = pathlib.Path(__file__).parent.resolve() / '__cache__'
_token_file = os.path.join(_cache_dir, 'tokens.json')
+
+
class InnerTube:
    """Object for interacting with the innertube API."""

    def __init__(self, client='ANDROID', use_oauth=False, allow_cache=True):
        """Initialize an InnerTube object.

        :param str client:
            Client to use for the object; must be a key of
            ``_default_clients``. Defaults to 'ANDROID'.
        :param bool use_oauth:
            Whether or not to authenticate to YouTube.
        :param bool allow_cache:
            Allows caching of oauth tokens on the machine.
        """
        self.context = _default_clients[client]['context']
        self.api_key = _default_clients[client]['api_key']
        self.access_token = None
        self.refresh_token = None
        self.use_oauth = use_oauth
        self.allow_cache = allow_cache

        # Stored as epoch time
        self.expires = None

        # Try to load tokens from the on-disk cache if allowed.
        if self.use_oauth and self.allow_cache:
            if os.path.exists(_token_file):
                with open(_token_file) as f:
                    data = json.load(f)
                    self.access_token = data['access_token']
                    self.refresh_token = data['refresh_token']
                    self.expires = data['expires']
                    # Refresh immediately in case the cached token expired.
                    self.refresh_bearer_token()

    def cache_tokens(self):
        """Cache tokens to file if allowed."""
        if not self.allow_cache:
            return

        data = {
            'access_token': self.access_token,
            'refresh_token': self.refresh_token,
            'expires': self.expires
        }
        # makedirs with exist_ok avoids the exists()/mkdir() race and also
        # creates any missing parent directories.
        os.makedirs(_cache_dir, exist_ok=True)
        with open(_token_file, 'w') as f:
            json.dump(data, f)

    def refresh_bearer_token(self, force=False):
        """Refreshes the OAuth token if necessary.

        :param bool force:
            Force-refresh the bearer token.
        """
        if not self.use_oauth:
            return
        # Skip refresh if it's not necessary and not forced. A missing
        # expiry timestamp (None) is treated as already expired; previously
        # this comparison raised TypeError when self.expires was None.
        if not force and self.expires is not None and self.expires > time.time():
            return

        # Subtracting 30 seconds is arbitrary to avoid potential time discrepencies
        start_time = int(time.time() - 30)
        data = {
            'client_id': _client_id,
            'client_secret': _client_secret,
            'grant_type': 'refresh_token',
            'refresh_token': self.refresh_token
        }
        response = request._execute_request(
            'https://oauth2.googleapis.com/token',
            'POST',
            headers={
                'Content-Type': 'application/json'
            },
            data=data
        )
        response_data = json.loads(response.read())

        self.access_token = response_data['access_token']
        self.expires = start_time + response_data['expires_in']
        self.cache_tokens()

    def fetch_bearer_token(self):
        """Fetch an OAuth token via Google's device authorization flow.

        Prompts the user to visit a verification URL and enter a code,
        then exchanges the device code for access/refresh tokens.
        """
        # Subtracting 30 seconds is arbitrary to avoid potential time discrepencies
        start_time = int(time.time() - 30)
        data = {
            'client_id': _client_id,
            'scope': 'https://www.googleapis.com/auth/youtube'
        }
        response = request._execute_request(
            'https://oauth2.googleapis.com/device/code',
            'POST',
            headers={
                'Content-Type': 'application/json'
            },
            data=data
        )
        response_data = json.loads(response.read())
        verification_url = response_data['verification_url']
        user_code = response_data['user_code']
        print(f'Please open {verification_url} and input code {user_code}')
        input('Press enter when you have completed this step.')

        data = {
            'client_id': _client_id,
            'client_secret': _client_secret,
            'device_code': response_data['device_code'],
            'grant_type': 'urn:ietf:params:oauth:grant-type:device_code'
        }
        response = request._execute_request(
            'https://oauth2.googleapis.com/token',
            'POST',
            headers={
                'Content-Type': 'application/json'
            },
            data=data
        )
        response_data = json.loads(response.read())

        self.access_token = response_data['access_token']
        self.refresh_token = response_data['refresh_token']
        self.expires = start_time + response_data['expires_in']
        self.cache_tokens()

    @property
    def base_url(self):
        """Return the base url endpoint for the innertube API."""
        return 'https://www.youtube.com/youtubei/v1'

    @property
    def base_data(self):
        """Return the base json data to transmit to the innertube API."""
        return {
            'context': self.context
        }

    @property
    def base_params(self):
        """Return the base query parameters to transmit to the innertube API."""
        return {
            'key': self.api_key,
            'contentCheckOk': True,
            'racyCheckOk': True
        }

    def _call_api(self, endpoint, query, data):
        """Make a request to a given endpoint with the provided query parameters and data.

        :param str endpoint:
            Full endpoint URL (without query string).
        :param dict query:
            Query parameters; mutated in place when oauth strips the API key.
        :param dict data:
            JSON-serializable request body.
        :rtype: dict
        :returns:
            Parsed JSON response.
        """
        # Remove the API key if oauth is being used.
        if self.use_oauth:
            del query['key']

        endpoint_url = f'{endpoint}?{parse.urlencode(query)}'
        headers = {
            'Content-Type': 'application/json',
        }
        # Add the bearer token if applicable, fetching one interactively
        # the first time around.
        if self.use_oauth:
            if self.access_token:
                self.refresh_bearer_token()
                headers['Authorization'] = f'Bearer {self.access_token}'
            else:
                self.fetch_bearer_token()
                headers['Authorization'] = f'Bearer {self.access_token}'

        response = request._execute_request(
            endpoint_url,
            'POST',
            headers=headers,
            data=data
        )
        return json.loads(response.read())

    def browse(self):
        """Make a request to the browse endpoint.

        TODO: Figure out how we can use this
        """
        # endpoint = f'{self.base_url}/browse'  # noqa:E800
        ...
        # return self._call_api(endpoint, query, self.base_data)  # noqa:E800

    def config(self):
        """Make a request to the config endpoint.

        TODO: Figure out how we can use this
        """
        # endpoint = f'{self.base_url}/config'  # noqa:E800
        ...
        # return self._call_api(endpoint, query, self.base_data)  # noqa:E800

    def guide(self):
        """Make a request to the guide endpoint.

        TODO: Figure out how we can use this
        """
        # endpoint = f'{self.base_url}/guide'  # noqa:E800
        ...
        # return self._call_api(endpoint, query, self.base_data)  # noqa:E800

    def next(self):
        """Make a request to the next endpoint.

        TODO: Figure out how we can use this
        """
        # endpoint = f'{self.base_url}/next'  # noqa:E800
        ...
        # return self._call_api(endpoint, query, self.base_data)  # noqa:E800

    def player(self, video_id):
        """Make a request to the player endpoint.

        :param str video_id:
            The video id to get player info for.
        :rtype: dict
        :returns:
            Raw player info results.
        """
        endpoint = f'{self.base_url}/player'
        query = {
            'videoId': video_id,
        }
        query.update(self.base_params)
        return self._call_api(endpoint, query, self.base_data)

    def search(self, search_query, continuation=None):
        """Make a request to the search endpoint.

        :param str search_query:
            The query to search.
        :param continuation:
            (optional) Continuation token for paging through results.
        :rtype: dict
        :returns:
            Raw search query results.
        """
        endpoint = f'{self.base_url}/search'
        query = {
            'query': search_query
        }
        query.update(self.base_params)
        data = {}
        if continuation:
            data['continuation'] = continuation
        data.update(self.base_data)
        return self._call_api(endpoint, query, data)

    def verify_age(self, video_id):
        """Make a request to the age_verify endpoint.

        Notable examples of the types of video this verification step is for:
        * https://www.youtube.com/watch?v=QLdAhwSBZ3w
        * https://www.youtube.com/watch?v=hc0ZDaAZQT0

        :param str video_id:
            The video id to get player info for.
        :rtype: dict
        :returns:
            Returns information that includes a URL for bypassing certain restrictions.
        """
        endpoint = f'{self.base_url}/verify_age'
        # 'setControvercy' (sic) matches the key the endpoint expects.
        data = {
            'nextEndpoint': {
                'urlEndpoint': {
                    'url': f'/watch?v={video_id}'
                }
            },
            'setControvercy': True
        }
        data.update(self.base_data)
        result = self._call_api(endpoint, self.base_params, data)
        return result

    def get_transcript(self, video_id):
        """Make a request to the get_transcript endpoint.

        This is likely related to captioning for videos, but is currently untested.
        """
        endpoint = f'{self.base_url}/get_transcript'
        query = {
            'videoId': video_id,
        }
        query.update(self.base_params)
        result = self._call_api(endpoint, query, self.base_data)
        return result
diff --git a/pytube/itags.py b/pytube/itags.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f23cae8a7e5d9d8903671708174cc8428b517bb
--- /dev/null
+++ b/pytube/itags.py
@@ -0,0 +1,144 @@
+"""This module contains a lookup table of YouTube's itag values."""
+from typing import Dict
+
# itag -> (resolution, average bitrate) for streams that carry audio and
# video together in a single file.
PROGRESSIVE_VIDEO = {
    5: ("240p", "64kbps"),
    6: ("270p", "64kbps"),
    13: ("144p", None),
    17: ("144p", "24kbps"),
    18: ("360p", "96kbps"),
    22: ("720p", "192kbps"),
    34: ("360p", "128kbps"),
    35: ("480p", "128kbps"),
    36: ("240p", None),
    37: ("1080p", "192kbps"),
    38: ("3072p", "192kbps"),
    43: ("360p", "128kbps"),
    44: ("480p", "128kbps"),
    45: ("720p", "192kbps"),
    46: ("1080p", "192kbps"),
    59: ("480p", "128kbps"),
    78: ("480p", "128kbps"),
    82: ("360p", "128kbps"),
    83: ("480p", "128kbps"),
    84: ("720p", "192kbps"),
    85: ("1080p", "192kbps"),
    91: ("144p", "48kbps"),
    92: ("240p", "48kbps"),
    93: ("360p", "128kbps"),
    94: ("480p", "128kbps"),
    95: ("720p", "256kbps"),
    96: ("1080p", "256kbps"),
    100: ("360p", "128kbps"),
    101: ("480p", "192kbps"),
    102: ("720p", "192kbps"),
    132: ("240p", "48kbps"),
    151: ("720p", "24kbps"),
    300: ("720p", "128kbps"),
    301: ("1080p", "128kbps"),
}

# itag -> (resolution, None): video-only DASH streams (audio is a separate
# stream; the abr slot is therefore always None here).
DASH_VIDEO = {
    # DASH Video
    133: ("240p", None),  # MP4
    134: ("360p", None),  # MP4
    135: ("480p", None),  # MP4
    136: ("720p", None),  # MP4
    137: ("1080p", None),  # MP4
    138: ("2160p", None),  # MP4
    160: ("144p", None),  # MP4
    167: ("360p", None),  # WEBM
    168: ("480p", None),  # WEBM
    169: ("720p", None),  # WEBM
    170: ("1080p", None),  # WEBM
    212: ("480p", None),  # MP4
    218: ("480p", None),  # WEBM
    219: ("480p", None),  # WEBM
    242: ("240p", None),  # WEBM
    243: ("360p", None),  # WEBM
    244: ("480p", None),  # WEBM
    245: ("480p", None),  # WEBM
    246: ("480p", None),  # WEBM
    247: ("720p", None),  # WEBM
    248: ("1080p", None),  # WEBM
    264: ("1440p", None),  # MP4
    266: ("2160p", None),  # MP4
    271: ("1440p", None),  # WEBM
    272: ("4320p", None),  # WEBM
    278: ("144p", None),  # WEBM
    298: ("720p", None),  # MP4
    299: ("1080p", None),  # MP4
    302: ("720p", None),  # WEBM
    303: ("1080p", None),  # WEBM
    308: ("1440p", None),  # WEBM
    313: ("2160p", None),  # WEBM
    315: ("2160p", None),  # WEBM
    330: ("144p", None),  # WEBM
    331: ("240p", None),  # WEBM
    332: ("360p", None),  # WEBM
    333: ("480p", None),  # WEBM
    334: ("720p", None),  # WEBM
    335: ("1080p", None),  # WEBM
    336: ("1440p", None),  # WEBM
    337: ("2160p", None),  # WEBM
    394: ("144p", None),  # MP4
    395: ("240p", None),  # MP4
    396: ("360p", None),  # MP4
    397: ("480p", None),  # MP4
    398: ("720p", None),  # MP4
    399: ("1080p", None),  # MP4
    400: ("1440p", None),  # MP4
    401: ("2160p", None),  # MP4
    402: ("4320p", None),  # MP4
    571: ("4320p", None),  # MP4
}

# itag -> (None, average bitrate): audio-only DASH streams (no resolution).
DASH_AUDIO = {
    # DASH Audio
    139: (None, "48kbps"),  # MP4
    140: (None, "128kbps"),  # MP4
    141: (None, "256kbps"),  # MP4
    171: (None, "128kbps"),  # WEBM
    172: (None, "256kbps"),  # WEBM
    249: (None, "50kbps"),  # WEBM
    250: (None, "70kbps"),  # WEBM
    251: (None, "160kbps"),  # WEBM
    256: (None, "192kbps"),  # MP4
    258: (None, "384kbps"),  # MP4
    325: (None, None),  # MP4
    328: (None, None),  # MP4
}

# Combined lookup table over all known itags.
ITAGS = {
    **PROGRESSIVE_VIDEO,
    **DASH_VIDEO,
    **DASH_AUDIO,
}

# itags with special characteristics: HDR color, stereoscopic 3D, and
# live-stream formats respectively.
HDR = [330, 331, 332, 333, 334, 335, 336, 337]
_3D = [82, 83, 84, 85, 100, 101, 102]
LIVE = [91, 92, 93, 94, 95, 96, 132, 151]
+
+
def get_format_profile(itag: int) -> Dict:
    """Get additional format information for a given itag.

    :param int itag:
        YouTube format identifier code (numeric strings are accepted too).
    :rtype: dict
    :returns:
        Dict with the stream's resolution, average bitrate, and flags for
        live/3D/HDR/DASH formats. Unknown itags yield None resolution/abr.
    """
    # Normalize so string itags (e.g. "22") hit the integer-keyed tables.
    itag = int(itag)
    # dict.get with a default replaces the previous membership-test branch.
    res, bitrate = ITAGS.get(itag, (None, None))
    return {
        "resolution": res,
        "abr": bitrate,
        "is_live": itag in LIVE,
        "is_3d": itag in _3D,
        "is_hdr": itag in HDR,
        "is_dash": (
            itag in DASH_AUDIO
            or itag in DASH_VIDEO
        ),
    }
diff --git a/pytube/metadata.py b/pytube/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..be12c632312a8afb8fdbad91200b95238cb49371
--- /dev/null
+++ b/pytube/metadata.py
@@ -0,0 +1,48 @@
+"""This module contains the YouTubeMetadata class."""
+import json
+from typing import Dict, List, Optional
+
+
class YouTubeMetadata:
    """Group raw video metadata rows into title -> value dicts.

    Rows are grouped: a new dict is started whenever a row carries
    ``hasDividerLine``.
    """

    def __init__(self, metadata: List):
        """Build grouped metadata from the raw row list.

        :param List metadata:
            Raw metadata rows; each row is a dict with optional
            'title', 'contents', and 'hasDividerLine' keys.
        """
        self._raw_metadata: List = metadata
        self._metadata: List[Dict] = [{}]

        for el in metadata:
            # We only add metadata to the dict if it has a simpleText title.
            if 'title' in el and 'simpleText' in el['title']:
                metadata_title = el['title']['simpleText']
            else:
                continue

            # Values come either as plain text or as the first text "run".
            contents = el['contents'][0]
            if 'simpleText' in contents:
                self._metadata[-1][metadata_title] = contents['simpleText']
            elif 'runs' in contents:
                self._metadata[-1][metadata_title] = contents['runs'][0]['text']

            # Upon reaching a dividing line, create a new grouping
            if el.get('hasDividerLine', False):
                self._metadata.append({})

        # If we happen to create an empty dict at the end, drop it
        if self._metadata[-1] == {}:
            self._metadata = self._metadata[:-1]

    def __getitem__(self, key):
        return self._metadata[key]

    def __iter__(self):
        for el in self._metadata:
            yield el

    def __str__(self):
        return json.dumps(self._metadata)

    @property
    def raw_metadata(self) -> List:
        """The unprocessed metadata rows this object was built from.

        Previously annotated Optional[Dict], but a List is stored and
        returned.
        """
        return self._raw_metadata

    @property
    def metadata(self) -> List[Dict]:
        """The grouped title -> value metadata dicts."""
        return self._metadata
diff --git a/pytube/monostate.py b/pytube/monostate.py
new file mode 100644
index 0000000000000000000000000000000000000000..7968af5fceeb1fcad31cba1fccbcb6b134ab2577
--- /dev/null
+++ b/pytube/monostate.py
@@ -0,0 +1,15 @@
+from typing import Any, Callable, Optional
+
+
class Monostate:
    """Mutable state shared across objects that download one video."""

    def __init__(
        self,
        on_progress: Optional[Callable[[Any, bytes, int], None]],
        on_complete: Optional[Callable[[Any, Optional[str]], None]],
        title: Optional[str] = None,
        duration: Optional[int] = None,
    ):
        """Capture callbacks and basic video details.

        :param on_progress: callback invoked as a download progresses.
        :param on_complete: callback invoked when a download finishes.
        :param title: video title, if already known.
        :param duration: video duration, if already known.
        """
        # Callbacks are stored as-is; either may be None.
        self.on_progress = on_progress
        self.on_complete = on_complete
        # Basic details default to None until filled in.
        self.title = title
        self.duration = duration
diff --git a/pytube/parser.py b/pytube/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..535a4b17789b34c02167043c3da1021335266950
--- /dev/null
+++ b/pytube/parser.py
@@ -0,0 +1,178 @@
+import ast
+import json
+import re
+from pytube.exceptions import HTMLParseError
+
+
def parse_for_all_objects(html, preceding_regex):
    """Parses input html to find all matches for the input starting point.

    :param str html:
        HTML to be parsed for an object.
    :param str preceding_regex:
        Regex to find the string preceding the object.
    :rtype list:
    :returns:
        A list of dicts created from parsing the objects.
    """
    found = []
    for occurrence in re.finditer(preceding_regex, html):
        try:
            parsed = parse_for_object_from_startpoint(html, occurrence.end())
        except HTMLParseError:
            # Some of the instances might fail because set is technically
            # a method of the ytcfg object. We'll skip these since they
            # don't seem relevant at the moment.
            continue
        found.append(parsed)

    if not found:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    return found
+
+
def parse_for_object(html, preceding_regex):
    """Parses input html to find the end of a JavaScript object.

    :param str html:
        HTML to be parsed for an object.
    :param str preceding_regex:
        Regex to find the string preceding the object.
    :rtype dict:
    :returns:
        A dict created from parsing the object.
    """
    match = re.search(preceding_regex, html)
    if match is None:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    # The object itself begins immediately after the matched prefix.
    return parse_for_object_from_startpoint(html, match.end())
+
+
def find_object_from_startpoint(html, start_point):
    """Parses input html to find the end of a JavaScript object.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :rtype dict:
    :returns:
        A dict created from parsing the object.
    """
    html = html[start_point:]
    if html[0] not in '{[':
        raise HTMLParseError(f'Invalid start point. Start of HTML:\n{html[:20]}')

    # Each opener maps to the character that closes its context.
    closer_for = {
        '{': '}',
        '[': ']',
        '"': '"'
    }

    # The first character is guaranteed to be an opener, so seed the
    # context stack with it and start scanning from the next character.
    contexts = [html[0]]
    pos = 1

    while pos < len(html) and contexts:
        ch = html[pos]

        # Closing the innermost context pops it off the stack.
        if ch == closer_for[contexts[-1]]:
            contexts.pop()
            pos += 1
            continue

        if contexts[-1] == '"':
            # Inside a string, a backslash escapes the next character.
            if ch == '\\':
                pos += 2
                continue
        elif ch in closer_for:
            # Outside strings, any opener starts a nested context.
            contexts.append(ch)

        pos += 1

    # Everything scanned so far is the complete object text.
    return html[:pos]
+
+
def parse_for_object_from_startpoint(html, start_point):
    """JSONifies an object parsed from HTML.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :rtype dict:
    :returns:
        A dict created from parsing the object.
    """
    raw_object = find_object_from_startpoint(html, start_point)
    try:
        return json.loads(raw_object)
    except json.decoder.JSONDecodeError:
        # Not strict JSON -- fall back to Python literal syntax, which
        # tolerates things like single-quoted strings.
        try:
            return ast.literal_eval(raw_object)
        except (ValueError, SyntaxError):
            raise HTMLParseError('Could not parse object.')
+
+
def throttling_array_split(js_array):
    """Parses the throttling array into a python list of strings.

    Expects input to begin with `[` and close with `]`.

    :param str js_array:
        The javascript array, as a string.
    :rtype: list:
    :returns:
        A list of strings representing splits on `,` in the throttling array.
    """
    results = []
    # Drop the opening '['; the trailing ']' is consumed by the final split.
    curr_substring = js_array[1:]

    comma_regex = re.compile(r",")
    func_regex = re.compile(r"function\([^)]*\)")

    while len(curr_substring) > 0:
        if curr_substring.startswith('function'):
            # Handle functions separately. These can contain commas
            match = func_regex.search(curr_substring)
            # Only the end of the signature is needed; the function body
            # opens right after the argument list closes. (The previous
            # version also unpacked an unused match start index.)
            match_end = match.end()

            function_text = find_object_from_startpoint(curr_substring, match_end)
            full_function_def = curr_substring[:match_end + len(function_text)]
            results.append(full_function_def)
            # Skip past the function and its trailing comma.
            curr_substring = curr_substring[len(full_function_def) + 1:]
        else:
            match = comma_regex.search(curr_substring)

            # Try-catch to capture end of array
            try:
                match_start, match_end = match.span()
            except AttributeError:
                # No comma left: the remaining element ends at the closing
                # bracket, which is excluded from the captured element.
                match_start = len(curr_substring) - 1
                match_end = match_start + 1

            curr_el = curr_substring[:match_start]
            results.append(curr_el)
            curr_substring = curr_substring[match_end:]

    return results
diff --git a/pytube/query.py b/pytube/query.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4878ba86c5a12c035b843b13a17aba19f511236
--- /dev/null
+++ b/pytube/query.py
@@ -0,0 +1,421 @@
+"""This module provides a query interface for media streams and captions."""
+from collections.abc import Mapping, Sequence
+from typing import Callable, List, Optional, Union
+
+from pytube import Caption, Stream
+from pytube.helpers import deprecated
+
+
class StreamQuery(Sequence):
    """Interface for querying the available media streams."""

    def __init__(self, fmt_streams):
        """Construct a :class:`StreamQuery <StreamQuery>`.

        param list fmt_streams:
            list of :class:`Stream <Stream>` instances.
        """
        self.fmt_streams = fmt_streams
        # Fast itag -> Stream lookup used by get_by_itag().
        self.itag_index = {int(s.itag): s for s in fmt_streams}

    def filter(
        self,
        fps=None,
        res=None,
        resolution=None,
        mime_type=None,
        type=None,
        subtype=None,
        file_extension=None,
        abr=None,
        bitrate=None,
        video_codec=None,
        audio_codec=None,
        only_audio=None,
        only_video=None,
        progressive=None,
        adaptive=None,
        is_dash=None,
        custom_filter_functions=None,
    ):
        """Apply the given filtering criterion.

        :param fps:
            (optional) The frames per second.
        :type fps:
            int or None

        :param res:
            (optional) The video resolution.
        :type res:
            str or None

        :param resolution:
            (optional) Alias to ``res``.
        :type resolution:
            str or None

        :param mime_type:
            (optional) Two-part identifier for file formats and format contents
            composed of a "type", a "subtype".
        :type mime_type:
            str or None

        :param type:
            (optional) Type part of the ``mime_type`` (e.g.: audio, video).
        :type type:
            str or None

        :param subtype:
            (optional) Sub-type part of the ``mime_type`` (e.g.: mp4, mov).
        :type subtype:
            str or None

        :param file_extension:
            (optional) Alias to ``sub_type``.
        :type file_extension:
            str or None

        :param abr:
            (optional) Average bitrate (ABR) refers to the average amount of
            data transferred per unit of time (e.g.: 64kbps, 192kbps).
        :type abr:
            str or None

        :param bitrate:
            (optional) Alias to ``abr``.
        :type bitrate:
            str or None

        :param video_codec:
            (optional) Video compression format.
        :type video_codec:
            str or None

        :param audio_codec:
            (optional) Audio compression format.
        :type audio_codec:
            str or None

        :param bool progressive:
            Excludes adaptive streams (one file contains both audio and video
            tracks).

        :param bool adaptive:
            Excludes progressive streams (audio and video are on separate
            tracks).

        :param bool is_dash:
            Include/exclude dash streams.

        :param bool only_audio:
            Excludes streams with video tracks.

        :param bool only_video:
            Excludes streams with audio tracks.

        :param custom_filter_functions:
            (optional) Interface for defining complex filters without
            subclassing.
        :type custom_filter_functions:
            list or None

        """
        filters = []
        if res or resolution:
            filters.append(lambda s: s.resolution == (res or resolution))

        if fps:
            filters.append(lambda s: s.fps == fps)

        if mime_type:
            filters.append(lambda s: s.mime_type == mime_type)

        # NOTE: ``type`` shadows the builtin, but renaming it would break
        # callers passing it as a keyword argument.
        if type:
            filters.append(lambda s: s.type == type)

        if subtype or file_extension:
            filters.append(lambda s: s.subtype == (subtype or file_extension))

        if abr or bitrate:
            filters.append(lambda s: s.abr == (abr or bitrate))

        if video_codec:
            filters.append(lambda s: s.video_codec == video_codec)

        if audio_codec:
            filters.append(lambda s: s.audio_codec == audio_codec)

        if only_audio:
            filters.append(
                lambda s: (
                    s.includes_audio_track and not s.includes_video_track
                ),
            )

        if only_video:
            filters.append(
                lambda s: (
                    s.includes_video_track and not s.includes_audio_track
                ),
            )

        if progressive:
            filters.append(lambda s: s.is_progressive)

        if adaptive:
            filters.append(lambda s: s.is_adaptive)

        if custom_filter_functions:
            filters.extend(custom_filter_functions)

        if is_dash is not None:
            filters.append(lambda s: s.is_dash == is_dash)

        return self._filter(filters)

    def _filter(self, filters: List[Callable]) -> "StreamQuery":
        """Apply each predicate in ``filters`` and wrap the result."""
        fmt_streams = self.fmt_streams
        for filter_lambda in filters:
            fmt_streams = filter(filter_lambda, fmt_streams)
        return StreamQuery(list(fmt_streams))

    def order_by(self, attribute_name: str) -> "StreamQuery":
        """Apply a sort order. Filters out streams that do not have the attribute.

        :param str attribute_name:
            The name of the attribute to sort by.
        """
        has_attribute = [
            s
            for s in self.fmt_streams
            if getattr(s, attribute_name) is not None
        ]
        # Check that the attributes have string values.
        if has_attribute and isinstance(
            getattr(has_attribute[0], attribute_name), str
        ):
            # Try to return a StreamQuery sorted by the integer representations
            # of the values.
            try:
                return StreamQuery(
                    sorted(
                        has_attribute,
                        key=lambda s: int(
                            "".join(
                                filter(str.isdigit, getattr(s, attribute_name))
                            )
                        ),  # type: ignore  # noqa: E501
                    )
                )
            except ValueError:
                # At least one value had no digits; fall back to the
                # plain attribute sort below.
                pass

        return StreamQuery(
            sorted(has_attribute, key=lambda s: getattr(s, attribute_name))
        )

    def desc(self) -> "StreamQuery":
        """Sort streams in descending order.

        This reverses the current sequence; call :meth:`order_by` first to
        establish an ascending order.

        :rtype: :class:`StreamQuery <StreamQuery>`

        """
        return StreamQuery(self.fmt_streams[::-1])

    def asc(self) -> "StreamQuery":
        """Sort streams in ascending order.

        :rtype: :class:`StreamQuery <StreamQuery>`

        """
        return self

    def get_by_itag(self, itag: int) -> Optional[Stream]:
        """Get the corresponding :class:`Stream <Stream>` for a given itag.

        :param int itag:
            YouTube format identifier code.
        :rtype: :class:`Stream <Stream>` or None
        :returns:
            The :class:`Stream <Stream>` matching the given itag or None if
            not found.

        """
        return self.itag_index.get(int(itag))

    def get_by_resolution(self, resolution: str) -> Optional[Stream]:
        """Get the corresponding :class:`Stream <Stream>` for a given resolution.

        Stream must be a progressive mp4.

        :param str resolution:
            Video resolution i.e. "720p", "480p", "360p", "240p", "144p"
        :rtype: :class:`Stream <Stream>` or None
        :returns:
            The :class:`Stream <Stream>` matching the given itag or None if
            not found.

        """
        return self.filter(
            progressive=True, subtype="mp4", resolution=resolution
        ).first()

    def get_lowest_resolution(self) -> Optional[Stream]:
        """Get lowest resolution stream that is a progressive mp4.

        :rtype: :class:`Stream <Stream>` or None
        :returns:
            The :class:`Stream <Stream>` matching the given itag or None if
            not found.

        """
        return (
            self.filter(progressive=True, subtype="mp4")
            .order_by("resolution")
            .first()
        )

    def get_highest_resolution(self) -> Optional[Stream]:
        """Get highest resolution stream that is a progressive video.

        :rtype: :class:`Stream <Stream>` or None
        :returns:
            The :class:`Stream <Stream>` matching the given itag or None if
            not found.

        """
        return self.filter(progressive=True).order_by("resolution").last()

    def get_audio_only(self, subtype: str = "mp4") -> Optional[Stream]:
        """Get highest bitrate audio stream for given codec (defaults to mp4)

        :param str subtype:
            Audio subtype, defaults to mp4
        :rtype: :class:`Stream <Stream>` or None
        :returns:
            The :class:`Stream <Stream>` matching the given itag or None if
            not found.
        """
        return (
            self.filter(only_audio=True, subtype=subtype)
            .order_by("abr")
            .last()
        )

    def otf(self, is_otf: bool = False) -> "StreamQuery":
        """Filter stream by OTF, useful if some streams have 404 URLs

        :param bool is_otf: Set to False to retrieve only non-OTF streams
        :rtype: :class:`StreamQuery <StreamQuery>`
        :returns: A StreamQuery object with otf filtered streams
        """
        return self._filter([lambda s: s.is_otf == is_otf])

    def first(self) -> Optional[Stream]:
        """Get the first :class:`Stream <Stream>` in the results.

        :rtype: :class:`Stream <Stream>` or None
        :returns:
            the first result of this query or None if the result doesn't
            contain any streams.

        """
        try:
            return self.fmt_streams[0]
        except IndexError:
            return None

    def last(self) -> Optional[Stream]:
        """Get the last :class:`Stream <Stream>` in the results.

        :rtype: :class:`Stream <Stream>` or None
        :returns:
            Return the last result of this query or None if the result
            doesn't contain any streams.

        """
        try:
            return self.fmt_streams[-1]
        except IndexError:
            # Explicit None keeps the contract symmetrical with first();
            # previously this fell through with a bare ``pass``.
            return None

    @deprecated("Get the size of this list directly using len()")
    def count(self, value: Optional[str] = None) -> int:  # pragma: no cover
        """Get the count of items in the list.

        :rtype: int
        """
        if value:
            return self.fmt_streams.count(value)

        return len(self)

    @deprecated("This object can be treated as a list, all() is useless")
    def all(self) -> List[Stream]:  # pragma: no cover
        """Get all the results represented by this query as a list.

        :rtype: list

        """
        return self.fmt_streams

    def __getitem__(self, i: Union[slice, int]):
        return self.fmt_streams[i]

    def __len__(self) -> int:
        return len(self.fmt_streams)

    def __repr__(self) -> str:
        return f"{self.fmt_streams}"
+
+
class CaptionQuery(Mapping):
    """Interface for querying the available captions."""

    def __init__(self, captions: List[Caption]):
        """Construct a :class:`CaptionQuery <CaptionQuery>`.

        param list captions:
            list of :class:`Caption <Caption>` instances.

        """
        # Index every caption by its language code for O(1) lookup.
        self.lang_code_index = {}
        for caption in captions:
            self.lang_code_index[caption.code] = caption

    @deprecated(
        "This object can be treated as a dictionary, i.e. captions['en']"
    )
    def get_by_language_code(
        self, lang_code: str
    ) -> Optional[Caption]:  # pragma: no cover
        """Look up the :class:`Caption <Caption>` for a given ``lang_code``.

        :param str lang_code:
            The code that identifies the caption language.
        :rtype: :class:`Caption <Caption>` or None
        :returns:
            The caption registered under ``lang_code``, or None when no
            caption uses that code.
        """
        return self.lang_code_index.get(lang_code)

    @deprecated("This object can be treated as a dictionary")
    def all(self) -> List[Caption]:  # pragma: no cover
        """Return every caption in this query as a list.

        :rtype: list

        """
        return list(self.lang_code_index.values())

    def __getitem__(self, lang_code: str):
        return self.lang_code_index[lang_code]

    def __len__(self) -> int:
        return len(self.lang_code_index)

    def __iter__(self):
        yield from self.lang_code_index.values()

    def __repr__(self) -> str:
        return str(self.lang_code_index)
diff --git a/pytube/request.py b/pytube/request.py
new file mode 100644
index 0000000000000000000000000000000000000000..e66a4642da68c0f9b121498ad0d71534741eee30
--- /dev/null
+++ b/pytube/request.py
@@ -0,0 +1,265 @@
+"""Implements a simple wrapper around urlopen."""
+import http.client
+import json
+import logging
+import re
+import socket
+from functools import lru_cache
+from urllib import parse
+from urllib.error import URLError
+from urllib.request import Request, urlopen
+
+from pytube.exceptions import RegexMatchError, MaxRetriesExceeded
+from pytube.helpers import regex_search
+
+logger = logging.getLogger(__name__)
+default_range_size = 9437184 # 9MB
+
+
+def _execute_request(
+ url,
+ method=None,
+ headers=None,
+ data=None,
+ timeout=socket._GLOBAL_DEFAULT_TIMEOUT
+):
+ base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"}
+ if headers:
+ base_headers.update(headers)
+ if data:
+ # encode data for request
+ if not isinstance(data, bytes):
+ data = bytes(json.dumps(data), encoding="utf-8")
+ if url.lower().startswith("http"):
+ request = Request(url, headers=base_headers, method=method, data=data)
+ else:
+ raise ValueError("Invalid URL")
+ return urlopen(request, timeout=timeout) # nosec
+
+
def get(url, extra_headers=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Send an http GET request.

    :param str url:
        The URL to perform the GET request for.
    :param dict extra_headers:
        Extra headers to add to the request.
    :rtype: str
    :returns:
        UTF-8 decoded string of the response body.
    """
    headers = {} if extra_headers is None else extra_headers
    response = _execute_request(url, headers=headers, timeout=timeout)
    return response.read().decode("utf-8")
+
+
def post(url, extra_headers=None, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Send an http POST request.

    :param str url:
        The URL to perform the POST request for.
    :param dict extra_headers:
        Extra headers to add to the request
    :param dict data:
        The data to send on the POST request
    :rtype: str
    :returns:
        UTF-8 encoded string of response
    """
    # could technically be implemented in get,
    # but to avoid confusion implemented like this
    if extra_headers is None:
        extra_headers = {}
    if data is None:
        data = {}
    # Content-Type is required because the youtube servers are strict on
    # content type; raises HTTPError [400]: Bad Request otherwise.
    # Copy instead of mutating so a caller-supplied dict is left untouched.
    headers = {**extra_headers, "Content-Type": "application/json"}
    response = _execute_request(
        url,
        headers=headers,
        data=data,
        timeout=timeout
    )
    return response.read().decode("utf-8")
+
+
def seq_stream(
    url,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    max_retries=0
):
    """Read the response in sequence.

    Fetches segment 0 (which carries the file headers, including the
    segment count), then requests each remaining segment by its ``sq``
    sequence number, yielding raw bytes as they arrive.

    :param str url: The URL to perform the GET request for.
    :param timeout: socket timeout forwarded to each request.
    :param int max_retries: retry budget forwarded to :func:`stream`.
    :rtype: Iterable[bytes]
    """
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)

    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)

    segment_data = b''
    for chunk in stream(url, timeout=timeout, max_retries=max_retries):
        yield chunk
        segment_data += chunk

    # We can then parse the header to find the number of segments
    stream_info = segment_data.split(b'\r\n')
    segment_count_pattern = re.compile(b'Segment-Count: (\\d+)')
    for line in stream_info:
        match = segment_count_pattern.search(line)
        if match:
            segment_count = int(match.group(1).decode('utf-8'))
    # NOTE(review): if no "Segment-Count" line is present, segment_count is
    # never bound and the loop below raises NameError — confirm YouTube
    # always sends this header for sequential (OTF) streams.

    # We request these segments sequentially to build the file.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        yield from stream(url, timeout=timeout, max_retries=max_retries)
        seq_num += 1
    return  # pylint: disable=R1711
+
+
def stream(
    url,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    max_retries=0
):
    """Read the response in chunks.

    Downloads the resource with ranged GET requests of
    ``default_range_size`` bytes each, yielding the received chunks.

    :param str url: The URL to perform the GET request for.
    :param timeout: socket timeout forwarded to each ranged request.
    :param int max_retries: how many times a single ranged request may be
        retried (after a socket timeout or incomplete read) before giving up.
    :rtype: Iterable[bytes]
    :raises MaxRetriesExceeded: when one range fails more than
        ``max_retries`` times.
    """
    # Start with a placeholder size; the real total is learned from the
    # Content-Range header of the first successful response.
    file_size: int = default_range_size  # fake filesize to start
    downloaded = 0
    while downloaded < file_size:
        stop_pos = min(downloaded + default_range_size, file_size) - 1
        range_header = f"bytes={downloaded}-{stop_pos}"
        tries = 0

        # Attempt to make the request multiple times as necessary.
        while True:
            # If the max retries is exceeded, raise an exception
            if tries >= 1 + max_retries:
                raise MaxRetriesExceeded()

            # Try to execute the request, ignoring socket timeouts
            try:
                response = _execute_request(
                    url,
                    method="GET",
                    headers={"Range": range_header},
                    timeout=timeout
                )
            except URLError as e:
                # We only want to skip over timeout errors, and
                # raise any other URLError exceptions
                if isinstance(e.reason, socket.timeout):
                    pass
                else:
                    raise
            except http.client.IncompleteRead:
                # Allow retries on IncompleteRead errors for unreliable connections
                pass
            else:
                # On a successful request, break from loop
                break
            tries += 1

        if file_size == default_range_size:
            # First successful response: parse the real total size from
            # "Content-Range: bytes start-stop/total".
            try:
                content_range = response.info()["Content-Range"]
                file_size = int(content_range.split("/")[1])
            except (KeyError, IndexError, ValueError) as e:
                # NOTE(review): on a missing/malformed header the placeholder
                # size is kept and the download may stop early — confirm this
                # best-effort behavior is intended.
                logger.error(e)
        # Drain the current ranged response and hand the bytes to the caller.
        while True:
            chunk = response.read()
            if not chunk:
                break
            downloaded += len(chunk)
            yield chunk
    return  # pylint: disable=R1711
+
+
@lru_cache()
def filesize(url):
    """Fetch size in bytes of file at given URL.

    Memoized per URL for the lifetime of the process.

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    """
    content_length = head(url)["content-length"]
    return int(content_length)
+
+
@lru_cache()
def seq_filesize(url):
    """Fetch size in bytes of file at given URL from sequential requests

    Used for streams served in numbered segments: sums the header segment
    plus a HEAD request per remaining segment.  Memoized per URL.

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    :raises RegexMatchError: if segment 0 carries no "Segment-Count" line.
    """
    total_filesize = 0
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)
    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)
    response = _execute_request(
        url, method="GET"
    )

    response_value = response.read()
    # The file header must be added to the total filesize
    total_filesize += len(response_value)

    # We can then parse the header to find the number of segments
    segment_count = 0
    stream_info = response_value.split(b'\r\n')
    segment_regex = b'Segment-Count: (\\d+)'
    for line in stream_info:
        # One of the lines should contain the segment count, but we don't know
        # which, so we need to iterate through the lines to find it
        try:
            segment_count = int(regex_search(segment_regex, line, 1))
        except RegexMatchError:
            pass

    if segment_count == 0:
        raise RegexMatchError('seq_filesize', segment_regex)

    # We make HEAD requests to the segments sequentially to find the total filesize.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        total_filesize += int(head(url)['content-length'])
        seq_num += 1
    return total_filesize
+
+
def head(url):
    """Fetch headers returned http GET request.

    :param str url:
        The URL to perform the GET request for.
    :rtype: dict
    :returns:
        dictionary of lowercase headers
    """
    response_headers = _execute_request(url, method="HEAD").info()
    return {name.lower(): value for name, value in response_headers.items()}
diff --git a/pytube/streams.py b/pytube/streams.py
new file mode 100644
index 0000000000000000000000000000000000000000..05ec6c1afc4cc5c9f9c723024d8e7febb353ade1
--- /dev/null
+++ b/pytube/streams.py
@@ -0,0 +1,374 @@
+"""
+This module contains a container for stream manifest data.
+
+A container object for the media stream (video only / audio only / video+audio
+combined). This was referred to as ``Video`` in the legacy pytube version, but
+has been renamed to accommodate DASH (which serves the audio and video
+separately).
+"""
+import logging
+import os
+from datetime import datetime
+from typing import BinaryIO, Dict, Optional, Tuple
+from urllib.error import HTTPError
+from urllib.parse import parse_qs
+
+from pytube import extract, request
+from pytube.helpers import safe_filename, target_directory
+from pytube.itags import get_format_profile
+from pytube.monostate import Monostate
+
+logger = logging.getLogger(__name__)
+
+
class Stream:
    """Container for stream manifest data."""

    def __init__(
        self, stream: Dict, monostate: Monostate
    ):
        """Construct a ``Stream``.

        :param dict stream:
            The unscrambled data extracted from YouTube.
        :param dict monostate:
            Dictionary of data shared across all instances of ``Stream``.
        """
        # A dictionary shared between all instances of ``Stream``
        # (Borg pattern).
        self._monostate = monostate

        self.url = stream["url"]  # signed download url
        self.itag = int(
            stream["itag"]
        )  # stream format id (youtube nomenclature)

        # set type and codec info

        # 'video/webm; codecs="vp8, vorbis"' -> 'video/webm', ['vp8', 'vorbis']
        self.mime_type, self.codecs = extract.mime_type_codec(stream["mimeType"])

        # 'video/webm' -> 'video', 'webm'
        self.type, self.subtype = self.mime_type.split("/")

        # ['vp8', 'vorbis'] -> video_codec: vp8, audio_codec: vorbis. DASH
        # streams return NoneType for audio/video depending.
        self.video_codec, self.audio_codec = self.parse_codecs()

        self.is_otf: bool = stream["is_otf"]
        self.bitrate: Optional[int] = stream["bitrate"]

        # filesize in bytes (0 when YouTube did not report a contentLength;
        # the `filesize` property then fetches it lazily over HTTP)
        self._filesize: Optional[int] = int(stream.get('contentLength', 0))

        # Additional information about the stream format, such as resolution,
        # frame rate, and whether the stream is live (HLS) or 3D.
        itag_profile = get_format_profile(self.itag)
        self.is_dash = itag_profile["is_dash"]
        self.abr = itag_profile["abr"]  # average bitrate (audio streams only)
        if 'fps' in stream:
            self.fps = stream['fps']  # Video streams only
        self.resolution = itag_profile[
            "resolution"
        ]  # resolution (e.g.: "480p")
        self.is_3d = itag_profile["is_3d"]
        self.is_hdr = itag_profile["is_hdr"]
        self.is_live = itag_profile["is_live"]

    @property
    def is_adaptive(self) -> bool:
        """Whether the stream is DASH.

        :rtype: bool
        """
        # if codecs has two elements (e.g.: ['vp8', 'vorbis']): 2 % 2 = 0
        # if codecs has one element (e.g.: ['vp8']) 1 % 2 = 1
        return bool(len(self.codecs) % 2)

    @property
    def is_progressive(self) -> bool:
        """Whether the stream is progressive.

        :rtype: bool
        """
        return not self.is_adaptive

    @property
    def includes_audio_track(self) -> bool:
        """Whether the stream only contains audio.

        :rtype: bool
        """
        return self.is_progressive or self.type == "audio"

    @property
    def includes_video_track(self) -> bool:
        """Whether the stream only contains video.

        :rtype: bool
        """
        return self.is_progressive or self.type == "video"

    def parse_codecs(self) -> Tuple[Optional[str], Optional[str]]:
        """Get the video/audio codecs from list of codecs.

        Parse a variable length sized list of codecs and returns a
        constant two element tuple, with the video codec as the first element
        and audio as the second. Returns None if one is not available
        (adaptive only).

        :rtype: tuple
        :returns:
            A two element tuple with audio and video codecs.

        """
        video = None
        audio = None
        if not self.is_adaptive:
            video, audio = self.codecs
        elif self.includes_video_track:
            video = self.codecs[0]
        elif self.includes_audio_track:
            audio = self.codecs[0]
        return video, audio

    @property
    def filesize(self) -> int:
        """File size of the media stream in bytes.

        Fetched lazily over HTTP when YouTube did not report a
        contentLength; falls back to sequential requests on a 404.

        :rtype: int
        :returns:
            Filesize (in bytes) of the stream.
        """
        if self._filesize == 0:
            try:
                self._filesize = request.filesize(self.url)
            except HTTPError as e:
                if e.code != 404:
                    raise
                self._filesize = request.seq_filesize(self.url)
        return self._filesize

    @property
    def title(self) -> str:
        """Get title of video

        :rtype: str
        :returns:
            Youtube video title
        """
        return self._monostate.title or "Unknown YouTube Video Title"

    @property
    def filesize_approx(self) -> int:
        """Get approximate filesize of the video

        Falls back to HTTP call if there is not sufficient information to approximate

        :rtype: int
        :returns: size of video in bytes
        """
        if self._monostate.duration and self.bitrate:
            bits_in_byte = 8
            return int(
                (self._monostate.duration * self.bitrate) / bits_in_byte
            )

        return self.filesize

    @property
    def expiration(self) -> datetime:
        # The signed URL carries an `expire` query parameter (unix seconds).
        expire = parse_qs(self.url.split("?")[1])["expire"][0]
        return datetime.utcfromtimestamp(int(expire))

    @property
    def default_filename(self) -> str:
        """Generate filename based on the video title.

        :rtype: str
        :returns:
            An os file system compatible filename.
        """
        filename = safe_filename(self.title)
        # FIX: use the sanitized title; a template artifact previously
        # produced the literal stem "(unknown)" for every download.
        return f"{filename}.{self.subtype}"

    def download(
        self,
        output_path: Optional[str] = None,
        filename: Optional[str] = None,
        filename_prefix: Optional[str] = None,
        skip_existing: bool = True,
        timeout: Optional[int] = None,
        max_retries: Optional[int] = 0
    ) -> str:
        """Write the media stream to disk.

        :param output_path:
            (optional) Output path for writing media file. If one is not
            specified, defaults to the current working directory.
        :type output_path: str or None
        :param filename:
            (optional) Output filename (stem only) for writing media file.
            If one is not specified, the default filename is used.
        :type filename: str or None
        :param filename_prefix:
            (optional) A string that will be prepended to the filename.
            For example a number in a playlist or the name of a series.
            If one is not specified, nothing will be prepended
            This is separate from filename so you can use the default
            filename but still add a prefix.
        :type filename_prefix: str or None
        :param skip_existing:
            (optional) Skip existing files, defaults to True
        :type skip_existing: bool
        :param timeout:
            (optional) Request timeout length in seconds. Uses system default.
        :type timeout: int
        :param max_retries:
            (optional) Number of retries to attempt after socket timeout. Defaults to 0.
        :type max_retries: int
        :returns:
            Path to the saved video
        :rtype: str

        """
        file_path = self.get_file_path(
            filename=filename,
            output_path=output_path,
            filename_prefix=filename_prefix,
        )

        if skip_existing and self.exists_at_path(file_path):
            logger.debug(f'file {file_path} already exists, skipping')
            self.on_complete(file_path)
            return file_path

        bytes_remaining = self.filesize
        logger.debug(f'downloading ({self.filesize} total bytes) file to {file_path}')

        with open(file_path, "wb") as fh:
            try:
                for chunk in request.stream(
                    self.url,
                    timeout=timeout,
                    max_retries=max_retries
                ):
                    # reduce the (bytes) remainder by the length of the chunk.
                    bytes_remaining -= len(chunk)
                    # send to the on_progress callback.
                    self.on_progress(chunk, fh, bytes_remaining)
            except HTTPError as e:
                if e.code != 404:
                    raise
                # Some adaptive streams need to be requested with sequence numbers
                for chunk in request.seq_stream(
                    self.url,
                    timeout=timeout,
                    max_retries=max_retries
                ):
                    # reduce the (bytes) remainder by the length of the chunk.
                    bytes_remaining -= len(chunk)
                    # send to the on_progress callback.
                    self.on_progress(chunk, fh, bytes_remaining)
        self.on_complete(file_path)
        return file_path

    def get_file_path(
        self,
        filename: Optional[str] = None,
        output_path: Optional[str] = None,
        filename_prefix: Optional[str] = None,
    ) -> str:
        """Resolve the full output path for a download."""
        if not filename:
            filename = self.default_filename
        if filename_prefix:
            # FIX: prepend the prefix to the computed filename; a template
            # artifact previously replaced the filename with "(unknown)".
            filename = f"{filename_prefix}{filename}"
        return os.path.join(target_directory(output_path), filename)

    def exists_at_path(self, file_path: str) -> bool:
        # A file only counts as existing when its size matches the stream's.
        return (
            os.path.isfile(file_path)
            and os.path.getsize(file_path) == self.filesize
        )

    def stream_to_buffer(self, buffer: BinaryIO) -> None:
        """Write the media stream to buffer

        :rtype: io.BytesIO buffer
        """
        bytes_remaining = self.filesize
        logger.info(
            "downloading (%s total bytes) file to buffer", self.filesize,
        )

        for chunk in request.stream(self.url):
            # reduce the (bytes) remainder by the length of the chunk.
            bytes_remaining -= len(chunk)
            # send to the on_progress callback.
            self.on_progress(chunk, buffer, bytes_remaining)
        self.on_complete(None)

    def on_progress(
        self, chunk: bytes, file_handler: BinaryIO, bytes_remaining: int
    ):
        """On progress callback function.

        This function writes the binary data to the file, then checks if an
        additional callback is defined in the monostate. This is exposed to
        allow things like displaying a progress bar.

        :param bytes chunk:
            Segment of media file binary data, not yet written to disk.
        :param file_handler:
            The file handle where the media is being written to.
        :type file_handler:
            :py:class:`io.BufferedWriter`
        :param int bytes_remaining:
            The delta between the total file size in bytes and amount already
            downloaded.

        :rtype: None

        """
        file_handler.write(chunk)
        logger.debug("download remaining: %s", bytes_remaining)
        if self._monostate.on_progress:
            self._monostate.on_progress(self, chunk, bytes_remaining)

    def on_complete(self, file_path: Optional[str]):
        """On download complete handler function.

        :param file_path:
            The file handle where the media is being written to.
        :type file_path: str

        :rtype: None

        """
        logger.debug("download finished")
        on_complete = self._monostate.on_complete
        if on_complete:
            logger.debug("calling on_complete callback %s", on_complete)
            on_complete(self, file_path)

    def __repr__(self) -> str:
        """Printable object representation.

        :rtype: str
        :returns:
            A string representation of a ``Stream`` object.
        """
        parts = ['itag="{s.itag}"', 'mime_type="{s.mime_type}"']
        if self.includes_video_track:
            # NOTE(review): self.fps is only set when the manifest carried an
            # 'fps' key — presumably always true for video tracks; confirm.
            parts.extend(['res="{s.resolution}"', 'fps="{s.fps}fps"'])
            if not self.is_adaptive:
                parts.extend(
                    ['vcodec="{s.video_codec}"', 'acodec="{s.audio_codec}"',]
                )
            else:
                parts.extend(['vcodec="{s.video_codec}"'])
        else:
            parts.extend(['abr="{s.abr}"', 'acodec="{s.audio_codec}"'])
        parts.extend(['progressive="{s.is_progressive}"', 'type="{s.type}"'])
        # FIX: restore the angle-bracketed repr; the string content had been
        # stripped, leaving an empty f-string.
        return f"<Stream: {' '.join(parts).format(s=self)}>"
diff --git a/pytube/version.py b/pytube/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..facad8adb3095438f19645eb554baa6896eab434
--- /dev/null
+++ b/pytube/version.py
@@ -0,0 +1,4 @@
# Package version string; printed when this module is executed directly
# (e.g. `python -m pytube.version`).
__version__ = "12.0.0.123"

if __name__ == "__main__":
    print(__version__)
diff --git a/repunct.py b/repunct.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f352201085afeb72e8e4f1af76d3ab0d7faa51
--- /dev/null
+++ b/repunct.py
@@ -0,0 +1,7 @@
+from myrpunct import RestorePuncts
+
def predict(input_text):
    """Restore punctuation in *input_text* using the rpunct model.

    :param str input_text: unpunctuated transcript text.
    :returns: the punctuated text.
    """
    restorer = RestorePuncts()
    punctuated = restorer.punctuate(input_text)
    print("Punctuation finished...")
    return punctuated
diff --git a/summarizer.py b/summarizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..171294c94e3f19ff4dea4ff923e4af24e2e18ed1
--- /dev/null
+++ b/summarizer.py
@@ -0,0 +1,153 @@
+import transcript as ts
+import ytvideo as vd
+import frames as fr
+#import repunct as rp
+import lexrank as lr
+
+# import sys
+# del sys.modules['ytvideo']
+# del sys.modules['transcript']
+# del sys.modules['frames']
+# del sys.modules['lexrank']
+
+
+#########################################################################
+# LEXRANK SUMMARY
+#########################################################################
+
def getSummaryImage(link, lexrank_switch, rpunkt_switch):
    """Extract the key-frame images for a YouTube video summary.

    Mirrors getSummary() but returns only the frame images (no HTML) and
    never runs punctuation restoration.
    """
    if len(link) == 0:
        return 'Error: no link provided'

    print('getting transcript using link: ', link)
    raw_transcript, type_transcript = ts.get_json_transcript(link, rpunkt_switch)
    print('transcript type: ', type_transcript)
    raw_caption = ts.get_caption(raw_transcript)

    # Punctuation restoration is skipped here; keep the raw transcript.
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    concat_list_summary = 'empty'
    if lexrank_switch:
        # Compress the transcript to roughly 5% of its sentences.
        nr_sentences = round(len(dict_sentences) * 0.05)
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A lexrank item may contain several real sentences separated by
        # full stops; re-split so each can be matched to a timestamp.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
        print('zip: ' + str(nr_sentences))
        if nr_sentences == 0:
            return 'Error: No sentences available', None
    else:
        concat_list_summary = [*dict_sentences.values()]

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(
        pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    result_get_video = vd.get_video(link)
    print('video: ' + result_get_video)

    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    return ts.getImages(dict_timestamp_summary)
+
+
def getSummary(link, lexrank_switch, rpunkt_switch):
    """Summarize a YouTube video into an HTML table plus key-frame images.

    :param str link: full YouTube watch URL.
    :param bool lexrank_switch: if True, compress the transcript to ~5% of
        its sentences with lexrank; otherwise keep every sentence.
    :param bool rpunkt_switch: if True, restore punctuation of the
        auto-generated English transcript with rpunct.
    :returns: tuple ``(html, images)``; ``(error_message, None)`` on failure.
    """
    if len(link) == 0:
        # FIX: return a 2-tuple so callers that unpack (html, images) work.
        return 'Error: no link provided', None

    print('getting transcript using link: ', link)
    raw_transcript, type_transcript = ts.get_json_transcript(link, rpunkt_switch)
    print('transcript type: ', type_transcript)
    raw_caption = ts.get_caption(raw_transcript)

    # Restore punctuation from the raw captions if requested.
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption
    if rpunkt_switch:
        # FIX: the module-level `import repunct as rp` was commented out,
        # so `rp` was undefined here. Import lazily — rpunct pulls in
        # heavy model dependencies that are only needed on this path.
        import repunct as rp
        # type_transcript[1] contains the text 'generated' or 'translated'
        print('Recovering punctuation from english text...', type_transcript[1])
        caption = rp.predict(raw_caption)
        pnct_caption = ts.restore_cr(raw_caption, caption)
        pnct_raw_transcript = ts.replacePunctuatedText(raw_transcript, pnct_caption)

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    if lexrank_switch:
        # Compress the transcript to roughly 5% of its sentences.
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        if nr_sentences == 0:
            # FIX: guard BEFORE calling lexrank with a zero-sentence request.
            return 'Error: No sentences available', None
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A lexrank item may contain several real sentences separated by
        # full stops; re-split so each can be matched to a timestamp.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
    else:
        concat_list_summary = [*dict_sentences.values()]

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(
        pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        # FIX: propagate the error as a 2-tuple for tuple-unpacking callers.
        return dict_timestamp_summary, None

    result_get_video = vd.get_video(link)
    print('video: ' + result_get_video)

    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    html_file = ts.convertToHTML(dict_timestamp_summary)
    images = ts.getImages(dict_timestamp_summary)

    return html_file, images
+
+
+#filename='/Users/hujo/Downloads/Channel_Summaries/wholesaleted.srt.pnct.txt'
+#with open(filename, 'w') as the_file:
+# the_file.write(raw_caption)
+
+#link ="https://www.youtube.com/watch?v=8uQDDUfGNPA" # blog
+#link = "https://www.youtube.com/watch?v=ofZEo0Rzo5s" # h-educate
+#link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A' #wholesale ted
+#link = 'https://www.youtube.com/watch?v=n8JHnLgodRI' #kevindavid
+#link = 'https://www.youtube.com/watch?v=6MI0f6YjJIk' # Nicholas
+#link = 'https://www.youtube.com/watch?v=bj9snrsSook' #Geldschnurrbart
+#link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE' #fcc tutorial
+#link = 'https://www.youtube.com/watch?v=0kJz0q0pvgQ&feature=youtu.be' # fcc
+
+#lexrank = True
+#result = getSummary(link, lexrank)
+#print(result)
+
diff --git a/transcript.py b/transcript.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a90bf433f7eae9a116e95a202a392369995f978
--- /dev/null
+++ b/transcript.py
@@ -0,0 +1,201 @@
+from youtube_transcript_api import YouTubeTranscriptApi
+import re
+from PIL import Image
+
+
+#transcript_list = YouTubeTranscriptApi.list_transcripts('ReHGSGwV4-A')
+#transcript = transcript_list.find_transcript(['en','de'])
+
+# step 1: download the json transcript for youtube video
def get_json_transcript(link, rpunkt_switch):
    """Download the JSON transcript for a YouTube video.

    :param str link: YouTube watch URL containing a ``v=`` parameter.
    :param bool rpunkt_switch: if True, prefer the auto-generated English
        transcript (falling back to a German transcript translated to
        English); otherwise take any manual en/de transcript.
    :returns: tuple ``(raw_transcript, type_transcript)`` where
        ``type_transcript`` is ``[language, origin]``; on an invalid link,
        ``(error_message, None)``.
    """
    if "v=" in link:
        video_id = link.split("v=")[1].split("&")[0]
    else:
        # FIX: return a 2-tuple so callers that unpack two values work.
        return "Error: Invalid Link, it does not have the pattern 'v=' in it.", None

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    # Get the auto-generated english text; if it is not available,
    # fall back to a German transcript translated to English.
    raw_transcript = 'empty'
    type_transcript = []
    if rpunkt_switch:
        try:
            transcript = transcript_list.find_generated_transcript(['en'])
            raw_transcript = transcript.fetch()
            type_transcript = ['en', 'generated']
        except Exception:
            # FIX: narrowed from a bare `except:` — still best-effort, but
            # no longer swallows SystemExit/KeyboardInterrupt.
            transcript = transcript_list.find_transcript(['de'])
            raw_transcript = transcript.translate('en').fetch()
            type_transcript = ['en', 'translated']
    else:
        transcript = transcript_list.find_transcript(['en', 'de'])
        raw_transcript = transcript.fetch()
        type_transcript = ['den', 'manual']

    return raw_transcript, type_transcript
+
+# step 2: extract timestamps from json transcript
# step 2: extract timestamps from json transcript
def get_timestamps(transcript_raw):
    """Extract the start timestamp of every transcript frame.

    :param transcript_raw: list of transcript dicts, each with a 'start' key.
    :returns: list of the start times as strings.
    """
    # FIX: build the list directly instead of joining on '\n' and
    # re-splitting; this also returns [] (not ['']) for an empty transcript.
    return [str(frame['start']) for frame in transcript_raw]
+
+# step 3: extract text from transcript
# step 3: extract text from transcript
def get_caption(transcript_raw):
    """Join the text of all transcript frames into one string.

    Newlines inside a single frame are flattened to spaces, so each line
    of the result corresponds to exactly one frame.
    """
    frame_lines = [frame['text'].replace('\n', ' ') for frame in transcript_raw]
    return '\n'.join(frame_lines)
+
def replacePunctuatedText(raw_transcript, caption):
    """Write the punctuated caption lines back into the transcript frames.

    Line i of *caption* replaces the text of frame i.
    NOTE: mutates *raw_transcript* in place and returns the same object.
    """
    punctuated_lines = caption.split('\n')
    for idx, frame in enumerate(raw_transcript):
        frame['text'] = punctuated_lines[idx]
    return raw_transcript
+
def getSentences(raw_transcript):
    """Split the transcript into sentences keyed by their frame number.

    Each frame's text is tagged with a '#<frame-nr>#' marker, the tagged
    text is split into sentences, and every sentence is mapped to the
    first marker found inside it.  Sentences without their own marker are
    dropped, as are sentences of 20 characters or less — this keeps
    lexrank from picking the short ones.
    """
    # Walk over each frame, wrapping its time-stamp index in hash signs.
    tagged = ''
    for idx, frame in enumerate(raw_transcript, start=1):
        tagged = tagged + ' #' + str(idx) + '# ' + frame['text'].replace('\n', ' ').replace('\n', ' ')

    dict_sentences = {}
    for sentence in tagged.strip().split('. '):
        marker = re.search(r"#[^#]*#", sentence)
        if marker is None:
            continue
        frame_nr = marker.group(0).replace('#', '')
        # Strip every marker from the sentence body and restore the
        # full stop removed by the split above.
        cleaned = re.sub(r'\s*#[^#]*#\s*', ' ', sentence) + '.'
        if len(cleaned) > 20:
            dict_sentences[frame_nr] = cleaned.strip()

    return dict_sentences
+
def convertToHTML(dsl):
    """Render the timestamp->sentence summary as an HTML table.

    One row per summary entry: running number, timestamp, the extracted
    frame image (frame_0001.jpg, frame_0002.jpg, ... under workdir/) and
    the caption.  The table is also written to ./workdir/output.html.

    NOTE: the HTML literals in the original were mangled (tag content
    stripped); this reconstructs a plain table with the same row data.
    """
    # Gradio serves local files through the 'file/' route.
    workdir = 'file/workdir/'
    cnt = 1
    html_rows = '<table><tr><th>Image Nr.</th><th>Timestamp [sec]</th><th>Image</th><th>Caption</th></tr>'
    for (key, val) in dsl.items():
        image = 'frame_' + f"{int(cnt):04d}" + '.jpg'
        sentence = val
        row = '<tr><td>' + str(cnt) + '</td>'
        # Timestamps may be fractional seconds, so they are kept as strings.
        row = row + '<td>' + key + '</td>'
        row = row + '<td><img src="' + workdir + image + '"></td>'
        row = row + '<td>' + sentence + '</td></tr>\n'
        html_rows = html_rows + row
        cnt = cnt + 1
    html_rows = html_rows + '</table>'

    filename = './workdir/output.html'
    with open(filename, 'w') as the_file:
        the_file.write(html_rows)

    return html_rows
+
def getImages(dsl):
    """Load the extracted frame image for every summary entry.

    Frame files are expected under workdir/ as frame_0001.jpg,
    frame_0002.jpg, ... — one per entry of *dsl*, in order.
    """
    images = []
    workdir = 'workdir/'
    for position in range(1, len(dsl) + 1):
        image_path = workdir + 'frame_' + f"{position:04d}" + '.jpg'
        images.append(Image.open(image_path))

    return images
+
+
+# 1.
+# dict_sentences contains all sentences with the frame-nr
+# list_summary contains the summed sentences
+# the task is to find for all summarized sentences the corresponding frame-nr
+# 2.
+# dict_frame_timestamp contains a mapping of frames to the timestamps
+# 3.
+# it is used to construct the sum_timestamps list of the timestamps for each summarized sentence
def getTimestampAtFrameFromSummary(raw_transcript, dict_sentences, list_summary):
    """Map each summarized sentence to the timestamp of its video frame.

    1. Find, for every summarized sentence, the frame number of the full
       sentence in *dict_sentences* that contains it.
    2. Build a frame-nr -> timestamp mapping from *raw_transcript*.
    3. Combine both into a timestamp -> sentence dict.

    Returns an error message string when not every summarized sentence
    could be located.
    """
    # 1. frame-nr -> full sentence, restricted to sentences that contain
    #    one of the summary items.
    dict_summary = {}
    for frame_nr, full_sentence in dict_sentences.items():
        for sentence in list_summary:
            if str(sentence) in full_sentence:
                dict_summary[frame_nr] = full_sentence

    # Sanity check: every summarized sentence must have been found.
    if len(list_summary) != len(dict_summary):
        err_msg = 'Error: Number of summarized sentences '+str(len(list_summary)) +' is not equal to the identified sentences '+str(len(dict_summary))+'.'
        print(err_msg)
        return err_msg

    # 2. frame-nr -> timestamp, taken from the raw transcript order.
    dict_frame_timestamp = {}
    for idx, line in enumerate(raw_transcript, start=1):
        dict_frame_timestamp[str(idx)] = str(line['start'])

    # 3. resolve each matched frame to its timestamp, then pair the
    #    timestamps with the summary sentences positionally.
    sum_timestamps = [dict_frame_timestamp.get(frame_nr) for frame_nr in dict_summary.keys()]

    dict_timestamp_summary = {}
    for idx, value in enumerate(list_summary):
        dict_timestamp_summary[sum_timestamps[idx]] = str(value)

    return dict_timestamp_summary
+
+
def restore_cr(input_text, output_text):
    """Restore the line breaks of *input_text* inside the punctuated text.

    Each line of the srt input corresponds to one video frame, so the
    punctuated text must be re-broken at exactly the same word positions.
    """
    # Mark every original line break with a '#' glued onto the word that
    # precedes it.
    srt_words = re.sub(r'\s*\n\s*', '# ', input_text.strip()).split(' ')
    punct_words = output_text.split(' ')

    # Both texts must contain the same number of words, otherwise the
    # break points cannot be matched up.
    if len(srt_words) != len(punct_words):
        return "AssertError: The length of the transcript and the punctuated file should be the same: ", len(srt_words), len(punct_words)

    marked = []
    for idx, word in enumerate(srt_words):
        if word.endswith('#'):
            marked.append(punct_words[idx] + '#')
        else:
            marked.append(punct_words[idx])

    # Reassemble and turn the markers back into line breaks.
    return ' '.join(marked).replace('#', '\n')
diff --git a/workdir/lion.jpg b/workdir/lion.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e9bf9f5d0816d6201b4862088dc74476249a6a70
Binary files /dev/null and b/workdir/lion.jpg differ
diff --git a/ytvideo.py b/ytvideo.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d3e57c11ab07e8bd4b51ebe7a4a4f22c539654a
--- /dev/null
+++ b/ytvideo.py
@@ -0,0 +1,54 @@
+from curses import error
+from pytube import YouTube
+import os
+
def get_obj_from_link(link):
    """Build a pytube YouTube object for the given watch URL."""
    return YouTube(link)
+
+
def get_filename_title(link):
    """Derive a filesystem-safe filename from the video title.

    :param str link: YouTube watch URL containing a ``v=`` parameter.
    :returns: ``<sanitized-title>=<video-id>`` on success, an error string
        for an invalid link, or the raised exception object when fetching
        the title failed.
    """
    import re

    if "v=" in link:
        # FIX: strip trailing query parameters from the id, consistent with
        # transcript.get_json_transcript (previously '&feature=...' leaked
        # into the filename).
        video_id = link.split("v=")[1].split("&")[0]
    else:
        return "Error: Invalid Link, it does not have the pattern 'v=' in it."
    yt = get_obj_from_link(link)
    # catch urllib.error.URLError
    try:
        title = yt.title
    except Exception as e:
        error_msg = 'Error: Retreiving the video title failed.'
        print(error_msg, e)
        return e

    # Replace spaces with underscores, then drop every character that is
    # not alphanumeric, underscore, umlaut or dash.
    title_ = re.sub(' ', '_', title)
    filetitle = re.sub('[^0-9a-zA-Z_äüöß-]+', '', title_)
    filetitle_vid = filetitle + '=' + video_id
    return filetitle_vid
+
def get_video(link):
    """Download the highest-resolution progressive mp4 of a YouTube video.

    The file is always saved as ./workdir/input_video.mp4; the working
    directory is created on demand.

    :param str link: YouTube watch URL.
    :returns: the path of the downloaded file, or the raised exception
        object when the download failed.
    """
    yt = get_obj_from_link(link)
    filetitle = 'input_video'
    working_directory = './workdir/'
    filetitle_mp4 = working_directory + filetitle + '.mp4'

    # create a working directory for the files
    if not os.path.isdir(working_directory):
        print('There is no working directory. Create a new one.')
        os.mkdir(working_directory)

    # catch urllib.error.URLError
    try:
        result = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename=filetitle_mp4)
    except Exception as e:
        # FIX: the old message appended `result`, which is unbound when the
        # download itself raised — the NameError masked the real error.
        error_msg = 'Error: Retreiving the video failed.'
        print(error_msg, e)
        return e

    return result