"""Implements a simple wrapper around urlopen."""
import http.client
import json
import logging
import re
import socket
from functools import lru_cache
from urllib import parse
from urllib.error import URLError
from urllib.request import Request, urlopen

from pytube.exceptions import RegexMatchError, MaxRetriesExceeded
from pytube.helpers import regex_search

logger = logging.getLogger(__name__)
default_range_size = 9437184  # 9MB


def _execute_request(
    url,
    method=None,
    headers=None,
    data=None,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT
):
    """Construct and send a request, returning the raw ``urlopen`` response."""
    base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"}
    if headers:
        base_headers.update(headers)
    if data:
        # encode data for request
        if not isinstance(data, bytes):
            data = bytes(json.dumps(data), encoding="utf-8")
    if url.lower().startswith("http"):
        request = Request(url, headers=base_headers, method=method, data=data)
    else:
        raise ValueError("Invalid URL")
    return urlopen(request, timeout=timeout)  # nosec


def get(url, extra_headers=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Send an http GET request.

    :param str url:
        The URL to perform the GET request for.
    :param dict extra_headers:
        Extra headers to add to the request
    :rtype: str
    :returns:
        UTF-8 encoded string of response
    """
    if extra_headers is None:
        extra_headers = {}
    response = _execute_request(url, headers=extra_headers, timeout=timeout)
    return response.read().decode("utf-8")


def post(url, extra_headers=None, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Send an http POST request.

    :param str url:
        The URL to perform the POST request for.
    :param dict extra_headers:
        Extra headers to add to the request
    :param dict data:
        The data to send on the POST request
    :rtype: str
    :returns:
        UTF-8 encoded string of response
    """
    # could technically be implemented in get,
    # but to avoid confusion implemented like this
    if extra_headers is None:
        extra_headers = {}
    if data is None:
        data = {}
    # required because the youtube servers are strict on content type
    # raises HTTPError [400]: Bad Request otherwise
    extra_headers.update({"Content-Type": "application/json"})
    response = _execute_request(
        url,
        headers=extra_headers,
        data=data,
        timeout=timeout
    )
    return response.read().decode("utf-8")


def seq_stream(
    url,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    max_retries=0
):
    """Read the response in sequence.

    :param str url: The URL to perform the GET request for.
    :rtype: Iterable[bytes]
    """
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)
    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)

    segment_data = b''
    for chunk in stream(url, timeout=timeout, max_retries=max_retries):
        yield chunk
        segment_data += chunk

    # We can then parse the header to find the number of segments
    stream_info = segment_data.split(b'\r\n')
    segment_count_pattern = re.compile(b'Segment-Count: (\\d+)')
    for line in stream_info:
        match = segment_count_pattern.search(line)
        if match:
            segment_count = int(match.group(1).decode('utf-8'))

    # We request these segments sequentially to build the file.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        yield from stream(url, timeout=timeout, max_retries=max_retries)
        seq_num += 1
    return  # pylint: disable=R1711
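

# Illustrative sketch only (not part of pytube's public API): one way the
# ``seq_stream`` generator above could be consumed to save a segmented
# download to disk. The function name and parameters are placeholders.
def _example_seq_download(url, file_path):  # pragma: no cover
    """Write every segment yielded by ``seq_stream`` to ``file_path``."""
    with open(file_path, "wb") as file_handle:
        for segment in seq_stream(url):
            file_handle.write(segment)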


def stream(
    url,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    max_retries=0
):
    """Read the response in chunks.

    :param str url: The URL to perform the GET request for.
    :rtype: Iterable[bytes]
    """
    file_size: int = default_range_size  # fake filesize to start
    downloaded = 0
    while downloaded < file_size:
        stop_pos = min(downloaded + default_range_size, file_size) - 1
        range_header = f"bytes={downloaded}-{stop_pos}"
        tries = 0

        # Attempt to make the request multiple times as necessary.
        while True:
            # If the max retries is exceeded, raise an exception
            if tries >= 1 + max_retries:
                raise MaxRetriesExceeded()

            # Try to execute the request, ignoring socket timeouts
            try:
                response = _execute_request(
                    url,
                    method="GET",
                    headers={"Range": range_header},
                    timeout=timeout
                )
            except URLError as e:
                # We only want to skip over timeout errors, and
                # raise any other URLError exceptions
                if isinstance(e.reason, socket.timeout):
                    pass
                else:
                    raise
            except http.client.IncompleteRead:
                # Allow retries on IncompleteRead errors for unreliable connections
                pass
            else:
                # On a successful request, break from loop
                break
            tries += 1

        if file_size == default_range_size:
            try:
                content_range = response.info()["Content-Range"]
                file_size = int(content_range.split("/")[1])
            except (KeyError, IndexError, ValueError) as e:
                logger.error(e)
        while True:
            chunk = response.read()
            if not chunk:
                break
            downloaded += len(chunk)
            yield chunk
    return  # pylint: disable=R1711
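

# Illustrative sketch only (not part of pytube's public API): consuming the
# ``stream`` generator above. Each yielded chunk is fetched with a ranged GET
# of at most ``default_range_size`` (9MB) bytes; names below are placeholders.
def _example_chunked_download(url, file_path, max_retries=2):  # pragma: no cover
    """Write each ranged chunk yielded by ``stream`` to ``file_path``."""
    downloaded = 0
    with open(file_path, "wb") as file_handle:
        for chunk in stream(url, max_retries=max_retries):
            file_handle.write(chunk)
            downloaded += len(chunk)
    logger.debug("downloaded %s bytes to %s", downloaded, file_path)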


@lru_cache()
def filesize(url):
    """Fetch size in bytes of file at given URL

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    """
    return int(head(url)["content-length"])


@lru_cache()
def seq_filesize(url):
    """Fetch size in bytes of file at given URL from sequential requests

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    """
    total_filesize = 0
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)
    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)
    response = _execute_request(
        url, method="GET"
    )
    response_value = response.read()
    # The file header must be added to the total filesize
    total_filesize += len(response_value)

    # We can then parse the header to find the number of segments
    segment_count = 0
    stream_info = response_value.split(b'\r\n')
    segment_regex = b'Segment-Count: (\\d+)'
    for line in stream_info:
        # One of the lines should contain the segment count, but we don't know
        # which, so we need to iterate through the lines to find it
        try:
            segment_count = int(regex_search(segment_regex, line, 1))
        except RegexMatchError:
            pass
    if segment_count == 0:
        raise RegexMatchError('seq_filesize', segment_regex)

    # We make HEAD requests to the segments sequentially to find the total filesize.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        total_filesize += int(head(url)['content-length'])
        seq_num += 1
    return total_filesize


def head(url):
    """Fetch headers returned by an http HEAD request.

    :param str url:
        The URL to perform the HEAD request for.
    :rtype: dict
    :returns:
        dictionary of lowercase headers
    """
    response_headers = _execute_request(url, method="HEAD").info()
    return {k.lower(): v for k, v in response_headers.items()}
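

# Minimal usage sketch, run only when this module is executed directly.
# The URL is a placeholder example endpoint, not anything pytube depends on.
if __name__ == "__main__":  # pragma: no cover
    example_url = "https://httpbin.org/get"
    # ``head`` issues a HEAD request and lowercases the header names.
    print(head(example_url))
    # ``get`` returns the response body decoded as UTF-8.
    print(get(example_url)[:200])
    # ``filesize`` reads the content-length header (the server must send one).
    print(filesize(example_url))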