# tubifier / pytube / parser.py
# (Web file-viewer residue kept for provenance: "wldmr's picture", "app file",
# "history blame", "No virus", "5.53 kB".)
import ast
import json
import re
from pytube.exceptions import HTMLParseError
def parse_for_all_objects(html, preceding_regex):
    """Parses input html to find all matches for the input starting point.

    Every position where ``preceding_regex`` matches is treated as the
    character just before an object; the object that follows is parsed with
    ``parse_for_object_from_startpoint``. Matches whose trailing text cannot
    be parsed are skipped.

    :param str html:
        HTML to be parsed for an object.
    :param str preceding_regex:
        Regex to find the string preceding the object.
    :rtype list:
        A list of the objects parsed after each successful match.
    :raises HTMLParseError:
        If no match yields a parseable object.
    """
    result = []
    regex = re.compile(preceding_regex)
    for match in regex.finditer(html):
        # The object is expected to begin right where the match ends.
        start_index = match.end()
        try:
            obj = parse_for_object_from_startpoint(html, start_index)
        except HTMLParseError:
            # Some of the instances might fail because set is technically
            # a method of the ytcfg object. We'll skip these since they
            # don't seem relevant at the moment.
            continue
        else:
            result.append(obj)

    if not result:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    return result
def parse_for_object(html, preceding_regex):
    """Parses the first object that follows a regex match in the html.

    :param str html:
        HTML to be parsed for an object.
    :param str preceding_regex:
        Regex to find the string preceding the object.
    :rtype dict:
        The object parsed from the text that follows the first match.
    :raises HTMLParseError:
        If ``preceding_regex`` does not match anywhere in ``html``.
    """
    regex = re.compile(preceding_regex)
    result = regex.search(html)
    if not result:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    # The object is expected to begin right where the match ends.
    start_index = result.end()
    return parse_for_object_from_startpoint(html, start_index)
def find_object_from_startpoint(html, start_point):
    """Finds the full text of a JavaScript object or array in the html.

    Scans forward from ``start_point`` while tracking nested ``{}``, ``[]``
    and double-quoted string contexts, and stops once the opening bracket
    has been closed.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :rtype str:
        The substring from the opening bracket through its matching closer
        (or to the end of the input if it is never closed).
    :raises HTMLParseError:
        If the text at ``start_point`` is empty or does not start with
        ``{`` or ``[``.
    """
    html = html[start_point:]
    # Slicing (rather than indexing) keeps an empty remainder from raising
    # IndexError; it is reported as an invalid start point instead.
    if html[:1] not in ('{', '['):
        raise HTMLParseError(f'Invalid start point. Start of HTML:\n{html[:20]}')

    # First letter MUST be a open brace, so we put that in the stack,
    # and skip the first character.
    stack = [html[0]]
    i = 1

    context_closers = {
        '{': '}',
        '[': ']',
        '"': '"'
    }

    while i < len(html):
        if len(stack) == 0:
            # The opening bracket has been matched; the object is complete.
            break
        curr_char = html[i]
        curr_context = stack[-1]

        # If we've reached a context closer, we can remove an element off the stack
        if curr_char == context_closers[curr_context]:
            stack.pop()
            i += 1
            continue

        # Strings require special context handling because they can contain
        # context openers *and* closers
        if curr_context == '"':
            # If there's a backslash in a string, we skip a character
            if curr_char == '\\':
                i += 2
                continue
        else:
            # Non-string contexts are when we need to look for context openers.
            if curr_char in context_closers.keys():
                stack.append(curr_char)

        i += 1

    full_obj = html[:i]
    return full_obj  # noqa: R504
def parse_for_object_from_startpoint(html, start_point):
    """JSONifies an object parsed from HTML.

    The object text is located with ``find_object_from_startpoint`` and
    decoded as JSON; if that fails, it is evaluated as a Python literal.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :rtype dict:
        The value produced by parsing the object text.
    :raises HTMLParseError:
        If the text is neither valid JSON nor a valid Python literal.
    """
    full_obj = find_object_from_startpoint(html, start_point)
    try:
        return json.loads(full_obj)
    except json.decoder.JSONDecodeError:
        # Not strict JSON; fall back to Python-literal evaluation, which is
        # restricted to literals and does not execute arbitrary code.
        try:
            return ast.literal_eval(full_obj)
        except (ValueError, SyntaxError) as err:
            raise HTMLParseError('Could not parse object.') from err
def throttling_array_split(js_array):
    """Parses the throttling array into a python list of strings.

    Expects input to begin with `[` and close with `]`.

    :param str js_array:
        The javascript array, as a string.
    :rtype: list:
        A list of strings representing splits on `,` in the throttling array.
        Elements that start with ``function`` are kept whole, including any
        commas inside their definition.
    """
    results = []
    # Drop the opening `[`; the closing `]` is dropped with the last element.
    curr_substring = js_array[1:]

    comma_regex = re.compile(r",")
    func_regex = re.compile(r"function\([^)]*\)")

    while curr_substring:
        if curr_substring.startswith('function'):
            # Handle functions separately. These can contain commas
            match = func_regex.search(curr_substring)
            match_end = match.end()

            # The function body starts right after the parameter list.
            function_text = find_object_from_startpoint(curr_substring, match_end)
            full_function_def = curr_substring[:match_end + len(function_text)]
            results.append(full_function_def)
            # +1 skips the separator (or closing bracket) after the function.
            curr_substring = curr_substring[len(full_function_def) + 1:]
        else:
            match = comma_regex.search(curr_substring)

            if match is None:
                # End of array: take everything except the final character.
                match_start = len(curr_substring) - 1
                match_end = match_start + 1
            else:
                match_start, match_end = match.span()

            results.append(curr_substring[:match_start])
            curr_substring = curr_substring[match_end:]

    return results