Taylor Fox Dahlin
committed on
Improvement/parser (#838)
Browse files
* Added a parser for javascript objects within html
- pytube/__main__.py +1 -3
- pytube/contrib/playlist.py +1 -1
- pytube/extract.py +19 -19
- pytube/parser.py +83 -0
- tests/test_extract.py +1 -1
- tests/test_metadata.py +1 -2
- tests/test_parser.py +58 -0
pytube/__main__.py
CHANGED
@@ -80,7 +80,6 @@ class YouTube:
|
|
80 |
|
81 |
self.fmt_streams: List[Stream] = []
|
82 |
|
83 |
-
self.initial_data_raw = None
|
84 |
self.initial_data = {}
|
85 |
self._metadata: Optional[YouTubeMetadata] = None
|
86 |
|
@@ -190,8 +189,7 @@ class YouTube:
|
|
190 |
video_id=self.video_id, watch_url=self.watch_url
|
191 |
)
|
192 |
|
193 |
-
self.
|
194 |
-
self.initial_data = json.loads(self.initial_data_raw)
|
195 |
|
196 |
self.vid_info_raw = request.get(self.vid_info_url)
|
197 |
if not self.age_restricted:
|
|
|
80 |
|
81 |
self.fmt_streams: List[Stream] = []
|
82 |
|
|
|
83 |
self.initial_data = {}
|
84 |
self._metadata: Optional[YouTubeMetadata] = None
|
85 |
|
|
|
189 |
video_id=self.video_id, watch_url=self.watch_url
|
190 |
)
|
191 |
|
192 |
+
self.initial_data = extract.initial_data(self.watch_html)
|
|
|
193 |
|
194 |
self.vid_info_raw = request.get(self.vid_info_url)
|
195 |
if not self.age_restricted:
|
pytube/contrib/playlist.py
CHANGED
@@ -51,7 +51,7 @@ class Playlist(Sequence):
|
|
51 |
"""
|
52 |
req = self.html
|
53 |
videos_urls, continuation = self._extract_videos(
|
54 |
-
extract.initial_data(self.html)
|
55 |
)
|
56 |
if until_watch_id:
|
57 |
try:
|
|
|
51 |
"""
|
52 |
req = self.html
|
53 |
videos_urls, continuation = self._extract_videos(
|
54 |
+
json.dumps(extract.initial_data(self.html))
|
55 |
)
|
56 |
if until_watch_id:
|
57 |
try:
|
pytube/extract.py
CHANGED
@@ -18,10 +18,12 @@ from urllib.parse import unquote
|
|
18 |
from urllib.parse import urlencode
|
19 |
|
20 |
from pytube.cipher import Cipher
|
|
|
21 |
from pytube.exceptions import LiveStreamError
|
22 |
from pytube.exceptions import RegexMatchError
|
23 |
from pytube.helpers import regex_search
|
24 |
from pytube.metadata import YouTubeMetadata
|
|
|
25 |
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
@@ -269,31 +271,29 @@ def get_ytplayer_config(html: str) -> Any:
|
|
269 |
"""
|
270 |
logger.debug("finding initial function name")
|
271 |
config_patterns = [
|
272 |
-
r"ytplayer\.config\s*=\s*
|
273 |
-
r"ytInitialPlayerResponse\s*=\s*
|
274 |
]
|
275 |
for pattern in config_patterns:
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
return json.loads(yt_player_config)
|
282 |
|
283 |
# setConfig() needs to be handled a little differently.
|
284 |
# We want to parse the entire argument to setConfig()
|
285 |
# and use then load that as json to find PLAYER_CONFIG
|
286 |
# inside of it.
|
287 |
setconfig_patterns = [
|
288 |
-
r"yt\.setConfig\(
|
289 |
]
|
290 |
for pattern in setconfig_patterns:
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
return json.loads(yt_config)['PLAYER_CONFIG']
|
297 |
|
298 |
raise RegexMatchError(
|
299 |
caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
|
@@ -431,11 +431,11 @@ def initial_data(watch_html: str) -> str:
|
|
431 |
@param watch_html: Html of the watch page
|
432 |
@return:
|
433 |
"""
|
434 |
-
initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*
|
435 |
try:
|
436 |
-
return
|
437 |
-
except
|
438 |
-
return
|
439 |
|
440 |
|
441 |
def metadata(initial_data) -> Optional[YouTubeMetadata]:
|
|
|
18 |
from urllib.parse import urlencode
|
19 |
|
20 |
from pytube.cipher import Cipher
|
21 |
+
from pytube.exceptions import HTMLParseError
|
22 |
from pytube.exceptions import LiveStreamError
|
23 |
from pytube.exceptions import RegexMatchError
|
24 |
from pytube.helpers import regex_search
|
25 |
from pytube.metadata import YouTubeMetadata
|
26 |
+
from pytube.parser import parse_for_object
|
27 |
|
28 |
logger = logging.getLogger(__name__)
|
29 |
|
|
|
271 |
"""
|
272 |
logger.debug("finding initial function name")
|
273 |
config_patterns = [
|
274 |
+
r"ytplayer\.config\s*=\s*",
|
275 |
+
r"ytInitialPlayerResponse\s*=\s*"
|
276 |
]
|
277 |
for pattern in config_patterns:
|
278 |
+
# Try each pattern consecutively if they don't find a match
|
279 |
+
try:
|
280 |
+
return parse_for_object(html, pattern)
|
281 |
+
except HTMLParseError:
|
282 |
+
continue
|
|
|
283 |
|
284 |
# setConfig() needs to be handled a little differently.
|
285 |
# We want to parse the entire argument to setConfig()
|
286 |
# and use then load that as json to find PLAYER_CONFIG
|
287 |
# inside of it.
|
288 |
setconfig_patterns = [
|
289 |
+
r"yt\.setConfig\(.*['\"]PLAYER_CONFIG['\"]:\s*"
|
290 |
]
|
291 |
for pattern in setconfig_patterns:
|
292 |
+
# Try each pattern consecutively if they don't find a match
|
293 |
+
try:
|
294 |
+
return parse_for_object(html, pattern)
|
295 |
+
except HTMLParseError:
|
296 |
+
continue
|
|
|
297 |
|
298 |
raise RegexMatchError(
|
299 |
caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
|
|
|
431 |
@param watch_html: Html of the watch page
|
432 |
@return:
|
433 |
"""
|
434 |
+
initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*"
|
435 |
try:
|
436 |
+
return parse_for_object(watch_html, initial_data_pattern)
|
437 |
+
except HTMLParseError:
|
438 |
+
return {}
|
439 |
|
440 |
|
441 |
def metadata(initial_data) -> Optional[YouTubeMetadata]:
|
pytube/parser.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
from pytube.exceptions import HTMLParseError
|
5 |
+
|
6 |
+
|
7 |
+
def parse_for_object(html, preceding_regex):
    """Find a JavaScript object in html and parse it into a dict.

    The object is expected to begin immediately after the text matched by
    ``preceding_regex``; the first match in ``html`` is used.

    :param str html:
        HTML to be parsed for an object.
    :param str preceding_regex:
        Regex to find the string preceding the object.
    :rtype dict:
    :returns:
        A dict created from parsing the object.
    :raises HTMLParseError:
        If ``preceding_regex`` does not match anywhere in ``html``.
    """
    match = re.search(preceding_regex, html)
    if match is None:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    # The object starts right where the preceding text ends.
    return parse_for_object_from_startpoint(html, match.end())
|
25 |
+
|
26 |
+
|
27 |
+
def parse_for_object_from_startpoint(html, start_point):
    """Parse the JavaScript object that starts at ``start_point`` in html.

    Scans forward from the opening brace, tracking nested braces, brackets
    and double-quoted strings on a stack, until the brace that closes the
    object is found. The delimited text is decoded with ``json.loads``; if
    it is not strict JSON (for example, it has a trailing comma) it is
    decoded with ``ast.literal_eval`` instead.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :rtype dict:
    :returns:
        A dict created from parsing the object.
    :raises HTMLParseError:
        If ``start_point`` does not index an opening brace (including when
        it lies at or past the end of ``html``), if the object is never
        closed, or if the delimited text cannot be decoded.
    """
    html = html[start_point:]
    # startswith() also covers an empty remainder, where html[0] would raise
    # IndexError instead of the HTMLParseError that callers catch.
    if not html.startswith('{'):
        raise HTMLParseError('Invalid start point.')

    # Maps each context opener to the character that closes it.
    context_closers = {
        '{': '}',
        '[': ']',
        '"': '"',
    }

    # First letter MUST be an open brace, so we put that in the stack,
    # and skip the first character.
    stack = ['{']
    i = 1
    end = len(html)

    while stack and i < end:
        curr_char = html[i]
        curr_context = stack[-1]

        if curr_char == context_closers[curr_context]:
            # Reached the closer of the current context.
            stack.pop()
        elif curr_context == '"':
            # Strings can contain context openers *and* closers, so nothing
            # is pushed here; a backslash escapes the next character, which
            # must be skipped so an escaped quote does not end the string.
            if curr_char == '\\':
                i += 1
        elif curr_char in context_closers:
            # Non-string contexts are where we look for context openers.
            stack.append(curr_char)

        i += 1

    if stack:
        # Ran out of input before the opening brace was balanced.
        raise HTMLParseError('Unterminated object.')

    full_obj = html[:i]
    try:
        return json.loads(full_obj)
    except json.decoder.JSONDecodeError:
        pass

    # NOTE: the input comes from a remote page. literal_eval does not execute
    # code, but it does reject anything that is not a Python literal; surface
    # that as a parse failure so callers can fall back to other patterns.
    try:
        return ast.literal_eval(full_obj)
    except (ValueError, TypeError, SyntaxError) as exc:
        raise HTMLParseError(f'Could not parse object: {exc}') from exc
|
tests/test_extract.py
CHANGED
@@ -106,7 +106,7 @@ def test_signature_cipher_does_not_error(stream_dict):
|
|
106 |
|
107 |
def test_initial_data_missing():
|
108 |
initial_data = extract.initial_data('')
|
109 |
-
assert initial_data ==
|
110 |
|
111 |
|
112 |
def test_initial_data(stream_dict):
|
|
|
106 |
|
107 |
def test_initial_data_missing():
|
108 |
initial_data = extract.initial_data('')
|
109 |
+
assert initial_data == {}
|
110 |
|
111 |
|
112 |
def test_initial_data(stream_dict):
|
tests/test_metadata.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
"""Unit tests for the :module:`metadata <metadata>` module."""
|
3 |
-
import json
|
4 |
from pytube import extract
|
5 |
|
6 |
|
@@ -11,7 +10,7 @@ def test_extract_metadata_empty():
|
|
11 |
|
12 |
def test_metadata_from_initial_data(stream_dict):
|
13 |
initial_data = extract.initial_data(stream_dict)
|
14 |
-
ytmd = extract.metadata(
|
15 |
assert len(ytmd.raw_metadata) > 0
|
16 |
assert 'contents' in ytmd.raw_metadata[0]
|
17 |
assert len(ytmd.metadata) > 0
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
"""Unit tests for the :module:`metadata <metadata>` module."""
|
|
|
3 |
from pytube import extract
|
4 |
|
5 |
|
|
|
10 |
|
11 |
def test_metadata_from_initial_data(stream_dict):
|
12 |
initial_data = extract.initial_data(stream_dict)
|
13 |
+
ytmd = extract.metadata(initial_data)
|
14 |
assert len(ytmd.raw_metadata) > 0
|
15 |
assert 'contents' in ytmd.raw_metadata[0]
|
16 |
assert len(ytmd.metadata) > 0
|
tests/test_parser.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pytest
|
3 |
+
|
4 |
+
from pytube.exceptions import HTMLParseError
|
5 |
+
from pytube.parser import parse_for_object
|
6 |
+
|
7 |
+
|
8 |
+
def test_invalid_start():
    """A preceding regex that matches nothing raises HTMLParseError."""
    html = 'test = {}'
    unmatched_pattern = r'invalid_regex'
    with pytest.raises(HTMLParseError):
        parse_for_object(html, unmatched_pattern)
|
11 |
+
|
12 |
+
|
13 |
+
def test_parse_simple_empty_object():
    """An empty object right after the assignment parses to an empty dict."""
    parsed = parse_for_object('test = {}', r'test\s*=\s*')
    assert parsed == {}
|
16 |
+
|
17 |
+
|
18 |
+
def test_parse_longer_empty_object():
# An object whose braces are separated by blank lines should still parse to an empty dict.
|
19 |
+
test_html = """test = {
|
20 |
+
|
21 |
+
|
22 |
+
}"""
|
23 |
+
result = parse_for_object(test_html, r'test\s*=\s*')
|
24 |
+
assert result == {}
|
25 |
+
|
26 |
+
|
27 |
+
def test_parse_empty_object_with_trailing_characters():
    """Characters after the closing brace are not part of the parsed object."""
    html_with_semicolon = 'test = {};'
    parsed = parse_for_object(html_with_semicolon, r'test\s*=\s*')
    assert parsed == {}
|
31 |
+
|
32 |
+
|
33 |
+
def test_parse_simple_object():
    """Nested empty list and object values are parsed as list and dict."""
    expected = {'foo': [], 'bar': {}}
    parsed = parse_for_object(
        'test = {"foo": [], "bar": {}};',
        r'test\s*=\s*',
    )
    assert parsed == expected
|
40 |
+
|
41 |
+
|
42 |
+
def test_parse_context_closer_in_string_value():
    """A closing brace inside a string value must not end the object early."""
    expected = {'foo': '};'}
    parsed = parse_for_object('test = {"foo": "};"};', r'test\s*=\s*')
    assert parsed == expected
|
48 |
+
|
49 |
+
|
50 |
+
def test_parse_object_requiring_ast():
    """An object that json rejects (trailing comma) is still parsed."""
    invalid_json = '{"foo": "bar",}'

    # Confirm the precondition: strict JSON decoding refuses this text.
    with pytest.raises(json.decoder.JSONDecodeError):
        json.loads(invalid_json)

    parsed = parse_for_object(f'test = {invalid_json}', r'test\s*=\s*')
    expected = {'foo': 'bar'}
    assert parsed == expected
|