File size: 5,526 Bytes
837fdb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
import ast
import json
import re
from pytube.exceptions import HTMLParseError
def parse_for_all_objects(html, preceding_regex):
    """Collect every object that follows a match of ``preceding_regex``.

    :param str html:
        HTML to be searched for objects.
    :param str preceding_regex:
        Regex matching the text that immediately precedes each object.
    :rtype: list
    :returns:
        The parsed objects, in the order their matches occur in ``html``.
    :raises HTMLParseError:
        If no match is followed by an object that could be parsed.
    """
    parsed_objects = []
    for hit in re.finditer(preceding_regex, html):
        try:
            parsed = parse_for_object_from_startpoint(html, hit.end())
        except HTMLParseError:
            # Some matches are not followed by an object at all, because
            # ``set`` is technically a method of the ytcfg object. Those
            # instances don't seem relevant at the moment, so skip them.
            continue
        parsed_objects.append(parsed)

    if not parsed_objects:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')
    return parsed_objects
def parse_for_object(html, preceding_regex):
    """Parse the object that follows the first match of ``preceding_regex``.

    :param str html:
        HTML to be searched for an object.
    :param str preceding_regex:
        Regex matching the text that immediately precedes the object.
    :returns:
        The value produced by parsing the object text that starts where the
        first match ends.
    :raises HTMLParseError:
        If ``preceding_regex`` does not match anywhere in ``html``.
    """
    first_hit = re.search(preceding_regex, html)
    if first_hit is None:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    # The object is expected to begin exactly where the match ends.
    return parse_for_object_from_startpoint(html, first_hit.end())
def find_object_from_startpoint(html, start_point):
    """Extract the text of the JavaScript object or array starting at an index.

    Scans forward from ``start_point`` while tracking nested ``{}`` / ``[]``
    pairs and quoted strings, and stops once the opening bracket has been
    closed. Brackets that appear inside single- or double-quoted strings are
    ignored, and a backslash inside a string escapes the next character.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of the opening ``{`` or ``[`` of the object.
    :rtype: str
    :returns:
        The substring of ``html`` from ``start_point`` through the matching
        closing bracket. If the input ends before the object is closed,
        everything from ``start_point`` to the end of ``html`` is returned.
    :raises HTMLParseError:
        If there is no character at ``start_point`` or it is neither ``{``
        nor ``[``.
    """
    html = html[start_point:]
    # An empty remainder (start point at/after the end of the input) must be
    # reported as a parse error rather than leaking an IndexError.
    if not html or html[0] not in ('{', '['):
        raise HTMLParseError(f'Invalid start point. Start of HTML:\n{html[:20]}')

    # Maps each context opener to the character that closes it. Both quote
    # styles are tracked so brackets inside either kind of string are ignored.
    context_closers = {
        '{': '}',
        '[': ']',
        '"': '"',
        "'": "'",
    }
    string_contexts = ('"', "'")

    # The first character is known to be an opener, so it seeds the stack
    # and scanning starts at the second character.
    stack = [html[0]]
    i = 1

    while stack and i < len(html):
        curr_char = html[i]
        curr_context = stack[-1]

        if curr_char == context_closers[curr_context]:
            # Reached the closer of the innermost open context.
            stack.pop()
        elif curr_context in string_contexts:
            # Strings can contain openers *and* closers, so nothing is pushed
            # here; a backslash additionally skips the escaped character.
            if curr_char == '\\':
                i += 1
        elif curr_char in context_closers:
            # Outside of strings, any opener starts a nested context.
            stack.append(curr_char)

        i += 1

    return html[:i]
def parse_for_object_from_startpoint(html, start_point):
    """Parse the object starting at ``start_point`` into a Python value.

    The object text is first decoded as JSON. If that fails, it is evaluated
    as a Python literal, which also accepts forms such as single-quoted
    strings that strict JSON rejects.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :returns:
        The value produced by ``json.loads`` or ``ast.literal_eval`` for the
        object text (typically a dict or a list, since the text starts with
        ``{`` or ``[``).
    :raises HTMLParseError:
        If the object text is neither valid JSON nor a valid Python literal.
        Errors raised while locating the object text propagate unchanged.
    """
    full_obj = find_object_from_startpoint(html, start_point)

    try:
        return json.loads(full_obj)
    except json.JSONDecodeError:
        # Not strict JSON; fall through to the more lenient literal parser.
        pass

    try:
        return ast.literal_eval(full_obj)
    except (ValueError, SyntaxError, TypeError) as err:
        # literal_eval can also raise TypeError (e.g. an unhashable dict
        # key); callers only expect HTMLParseError for unparseable objects.
        raise HTMLParseError('Could not parse object.') from err
def throttling_array_split(js_array):
    """Parses the throttling array into a python list of strings.

    Expects input to begin with `[` and close with `]`. Elements are split on
    `,`, except that an element beginning with an anonymous function header
    (``function(...)``) is kept whole, because its body can contain commas.

    NOTE: commas inside string elements are not treated specially, so such
    elements are still split on them.

    :param str js_array:
        The javascript array, as a string.
    :rtype: list
    :returns:
        A list of strings representing splits on `,` in the throttling array.
    """
    results = []
    # Drop the opening "[" of the array.
    remaining = js_array[1:]
    func_regex = re.compile(r"function\([^)]*\)")

    while remaining:
        # match() is anchored at the start of the remaining text, so only a
        # function header that begins the current element is recognised.
        # Anything else (including identifiers that merely start with
        # "function") falls through to the plain comma split below.
        func_match = func_regex.match(remaining)
        if func_match is not None:
            header_end = func_match.end()
            # The function body is the braced block right after the header.
            function_body = find_object_from_startpoint(remaining, header_end)
            full_function_def = remaining[:header_end + len(function_body)]
            results.append(full_function_def)
            # The extra +1 skips the "," or "]" that follows the function.
            remaining = remaining[len(full_function_def) + 1:]
            continue

        comma_index = remaining.find(',')
        if comma_index == -1:
            # Last element: everything left except the final character,
            # which is expected to be the closing "]".
            results.append(remaining[:-1])
            break

        results.append(remaining[:comma_index])
        remaining = remaining[comma_index + 1:]

    return results
|