Taylor Fox Dahlin committed on
Commit
6e4ef92
·
unverified ·
1 Parent(s): 0217809

Improvement/parser (#838)

Browse files

* Added a parser for JavaScript objects within HTML

pytube/__main__.py CHANGED
@@ -80,7 +80,6 @@ class YouTube:
80
 
81
  self.fmt_streams: List[Stream] = []
82
 
83
- self.initial_data_raw = None
84
  self.initial_data = {}
85
  self._metadata: Optional[YouTubeMetadata] = None
86
 
@@ -190,8 +189,7 @@ class YouTube:
190
  video_id=self.video_id, watch_url=self.watch_url
191
  )
192
 
193
- self.initial_data_raw = extract.initial_data(self.watch_html)
194
- self.initial_data = json.loads(self.initial_data_raw)
195
 
196
  self.vid_info_raw = request.get(self.vid_info_url)
197
  if not self.age_restricted:
 
80
 
81
  self.fmt_streams: List[Stream] = []
82
 
 
83
  self.initial_data = {}
84
  self._metadata: Optional[YouTubeMetadata] = None
85
 
 
189
  video_id=self.video_id, watch_url=self.watch_url
190
  )
191
 
192
+ self.initial_data = extract.initial_data(self.watch_html)
 
193
 
194
  self.vid_info_raw = request.get(self.vid_info_url)
195
  if not self.age_restricted:
pytube/contrib/playlist.py CHANGED
@@ -51,7 +51,7 @@ class Playlist(Sequence):
51
  """
52
  req = self.html
53
  videos_urls, continuation = self._extract_videos(
54
- extract.initial_data(self.html)
55
  )
56
  if until_watch_id:
57
  try:
 
51
  """
52
  req = self.html
53
  videos_urls, continuation = self._extract_videos(
54
+ json.dumps(extract.initial_data(self.html))
55
  )
56
  if until_watch_id:
57
  try:
pytube/extract.py CHANGED
@@ -18,10 +18,12 @@ from urllib.parse import unquote
18
  from urllib.parse import urlencode
19
 
20
  from pytube.cipher import Cipher
 
21
  from pytube.exceptions import LiveStreamError
22
  from pytube.exceptions import RegexMatchError
23
  from pytube.helpers import regex_search
24
  from pytube.metadata import YouTubeMetadata
 
25
 
26
  logger = logging.getLogger(__name__)
27
 
@@ -269,31 +271,29 @@ def get_ytplayer_config(html: str) -> Any:
269
  """
270
  logger.debug("finding initial function name")
271
  config_patterns = [
272
- r"ytplayer\.config\s*=\s*({.+?});ytplayer",
273
- r"ytInitialPlayerResponse\s*=\s*({.+?(?<!gdpr)});"
274
  ]
275
  for pattern in config_patterns:
276
- regex = re.compile(pattern)
277
- function_match = regex.search(html)
278
- if function_match:
279
- logger.debug("finished regex search, matched: %s", pattern)
280
- yt_player_config = function_match.group(1)
281
- return json.loads(yt_player_config)
282
 
283
  # setConfig() needs to be handled a little differently.
284
  # We want to parse the entire argument to setConfig()
285
  # and use then load that as json to find PLAYER_CONFIG
286
  # inside of it.
287
  setconfig_patterns = [
288
- r"yt\.setConfig\((.*['\"]PLAYER_CONFIG['\"]:\s*{.+?})\);"
289
  ]
290
  for pattern in setconfig_patterns:
291
- regex = re.compile(pattern)
292
- function_match = regex.search(html)
293
- if function_match:
294
- logger.debug("finished regex search, matched: %s", pattern)
295
- yt_config = function_match.group(1)
296
- return json.loads(yt_config)['PLAYER_CONFIG']
297
 
298
  raise RegexMatchError(
299
  caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
@@ -431,11 +431,11 @@ def initial_data(watch_html: str) -> str:
431
  @param watch_html: Html of the watch page
432
  @return:
433
  """
434
- initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*([^\n]+);"
435
  try:
436
- return regex_search(initial_data_pattern, watch_html, 1)
437
- except RegexMatchError:
438
- return "{}"
439
 
440
 
441
  def metadata(initial_data) -> Optional[YouTubeMetadata]:
 
18
  from urllib.parse import urlencode
19
 
20
  from pytube.cipher import Cipher
21
+ from pytube.exceptions import HTMLParseError
22
  from pytube.exceptions import LiveStreamError
23
  from pytube.exceptions import RegexMatchError
24
  from pytube.helpers import regex_search
25
  from pytube.metadata import YouTubeMetadata
26
+ from pytube.parser import parse_for_object
27
 
28
  logger = logging.getLogger(__name__)
29
 
 
271
  """
272
  logger.debug("finding initial function name")
273
  config_patterns = [
274
+ r"ytplayer\.config\s*=\s*",
275
+ r"ytInitialPlayerResponse\s*=\s*"
276
  ]
277
  for pattern in config_patterns:
278
+ # Try each pattern consecutively if they don't find a match
279
+ try:
280
+ return parse_for_object(html, pattern)
281
+ except HTMLParseError:
282
+ continue
 
283
 
284
  # setConfig() needs to be handled a little differently.
285
  # We want to parse the entire argument to setConfig()
286
  # and use then load that as json to find PLAYER_CONFIG
287
  # inside of it.
288
  setconfig_patterns = [
289
+ r"yt\.setConfig\(.*['\"]PLAYER_CONFIG['\"]:\s*"
290
  ]
291
  for pattern in setconfig_patterns:
292
+ # Try each pattern consecutively if they don't find a match
293
+ try:
294
+ return parse_for_object(html, pattern)
295
+ except HTMLParseError:
296
+ continue
 
297
 
298
  raise RegexMatchError(
299
  caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
 
431
  @param watch_html: Html of the watch page
432
  @return:
433
  """
434
+ initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*"
435
  try:
436
+ return parse_for_object(watch_html, initial_data_pattern)
437
+ except HTMLParseError:
438
+ return {}
439
 
440
 
441
  def metadata(initial_data) -> Optional[YouTubeMetadata]:
pytube/parser.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import json
3
+ import re
4
+ from pytube.exceptions import HTMLParseError
5
+
6
+
7
+ def parse_for_object(html, preceding_regex):
8
+ """Parses input html to find the end of a JavaScript object.
9
+
10
+ :param str html:
11
+ HTML to be parsed for an object.
12
+ :param str preceding_regex:
13
+ Regex to find the string preceding the object.
14
+ :rtype dict:
15
+ :returns:
16
+ A dict created from parsing the object.
17
+ """
18
+ regex = re.compile(preceding_regex)
19
+ result = regex.search(html)
20
+ if not result:
21
+ raise HTMLParseError(f'No matches for regex {preceding_regex}')
22
+
23
+ start_index = result.span()[1]
24
+ return parse_for_object_from_startpoint(html, start_index)
25
+
26
+
27
+ def parse_for_object_from_startpoint(html, start_point):
28
+ """Parses input html to find the end of a JavaScript object.
29
+
30
+ :param str html:
31
+ HTML to be parsed for an object.
32
+ :param int start_point:
33
+ Index of where the object starts.
34
+ :rtype dict:
35
+ :returns:
36
+ A dict created from parsing the object.
37
+ """
38
+ html = html[start_point:]
39
+ if html[0] != '{':
40
+ raise HTMLParseError('Invalid start point.')
41
+
42
+ # First letter MUST be a open brace, so we put that in the stack,
43
+ # and skip the first character.
44
+ stack = ['{']
45
+ i = 1
46
+
47
+ context_closers = {
48
+ '{': '}',
49
+ '[': ']',
50
+ '"': '"'
51
+ }
52
+
53
+ while i < len(html):
54
+ if len(stack) == 0:
55
+ break
56
+ curr_char = html[i]
57
+ curr_context = stack[-1]
58
+
59
+ # If we've reached a context closer, we can remove an element off the stack
60
+ if curr_char == context_closers[curr_context]:
61
+ stack.pop()
62
+ i += 1
63
+ continue
64
+
65
+ # Strings require special context handling because they can contain
66
+ # context openers *and* closers
67
+ if curr_context == '"':
68
+ # If there's a backslash in a string, we skip a character
69
+ if curr_char == '\\':
70
+ i += 2
71
+ continue
72
+ else:
73
+ # Non-string contexts are when we need to look for context openers.
74
+ if curr_char in context_closers.keys():
75
+ stack.append(curr_char)
76
+
77
+ i += 1
78
+
79
+ full_obj = html[:i]
80
+ try:
81
+ return json.loads(full_obj)
82
+ except json.decoder.JSONDecodeError:
83
+ return ast.literal_eval(full_obj)
tests/test_extract.py CHANGED
@@ -106,7 +106,7 @@ def test_signature_cipher_does_not_error(stream_dict):
106
 
107
  def test_initial_data_missing():
108
  initial_data = extract.initial_data('')
109
- assert initial_data == "{}"
110
 
111
 
112
  def test_initial_data(stream_dict):
 
106
 
107
  def test_initial_data_missing():
108
  initial_data = extract.initial_data('')
109
+ assert initial_data == {}
110
 
111
 
112
  def test_initial_data(stream_dict):
tests/test_metadata.py CHANGED
@@ -1,6 +1,5 @@
1
  # -*- coding: utf-8 -*-
2
  """Unit tests for the :module:`metadata <metadata>` module."""
3
- import json
4
  from pytube import extract
5
 
6
 
@@ -11,7 +10,7 @@ def test_extract_metadata_empty():
11
 
12
  def test_metadata_from_initial_data(stream_dict):
13
  initial_data = extract.initial_data(stream_dict)
14
- ytmd = extract.metadata(json.loads(initial_data))
15
  assert len(ytmd.raw_metadata) > 0
16
  assert 'contents' in ytmd.raw_metadata[0]
17
  assert len(ytmd.metadata) > 0
 
1
  # -*- coding: utf-8 -*-
2
  """Unit tests for the :module:`metadata <metadata>` module."""
 
3
  from pytube import extract
4
 
5
 
 
10
 
11
  def test_metadata_from_initial_data(stream_dict):
12
  initial_data = extract.initial_data(stream_dict)
13
+ ytmd = extract.metadata(initial_data)
14
  assert len(ytmd.raw_metadata) > 0
15
  assert 'contents' in ytmd.raw_metadata[0]
16
  assert len(ytmd.metadata) > 0
tests/test_parser.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pytest
3
+
4
+ from pytube.exceptions import HTMLParseError
5
+ from pytube.parser import parse_for_object
6
+
7
+
8
+ def test_invalid_start():
9
+ with pytest.raises(HTMLParseError):
10
+ parse_for_object('test = {}', r'invalid_regex')
11
+
12
+
13
+ def test_parse_simple_empty_object():
14
+ result = parse_for_object('test = {}', r'test\s*=\s*')
15
+ assert result == {}
16
+
17
+
18
+ def test_parse_longer_empty_object():
19
+ test_html = """test = {
20
+
21
+
22
+ }"""
23
+ result = parse_for_object(test_html, r'test\s*=\s*')
24
+ assert result == {}
25
+
26
+
27
+ def test_parse_empty_object_with_trailing_characters():
28
+ test_html = 'test = {};'
29
+ result = parse_for_object(test_html, r'test\s*=\s*')
30
+ assert result == {}
31
+
32
+
33
+ def test_parse_simple_object():
34
+ test_html = 'test = {"foo": [], "bar": {}};'
35
+ result = parse_for_object(test_html, r'test\s*=\s*')
36
+ assert result == {
37
+ 'foo': [],
38
+ 'bar': {}
39
+ }
40
+
41
+
42
+ def test_parse_context_closer_in_string_value():
43
+ test_html = 'test = {"foo": "};"};'
44
+ result = parse_for_object(test_html, r'test\s*=\s*')
45
+ assert result == {
46
+ 'foo': '};'
47
+ }
48
+
49
+
50
+ def test_parse_object_requiring_ast():
51
+ invalid_json = '{"foo": "bar",}'
52
+ test_html = f'test = {invalid_json}'
53
+ with pytest.raises(json.decoder.JSONDecodeError):
54
+ json.loads(invalid_json)
55
+ result = parse_for_object(test_html, r'test\s*=\s*')
56
+ assert result == {
57
+ 'foo': 'bar'
58
+ }