Spaces:

wldmr
/

tubifier

Sleeping

App Files Files Community

tubifier / pytube /parser.py

wldmr

app file

837fdb6 about 2 years ago

raw

history blame

5.53 kB

	import ast
	import json
	import re
	from pytube.exceptions import HTMLParseError


	def parse_for_all_objects(html, preceding_regex):
	    """Collect every object that follows a match of ``preceding_regex``.

	    Each regex match marks a candidate start point; the text right after the
	    match is handed to ``parse_for_object_from_startpoint``. Candidates that
	    cannot be parsed are silently skipped.

	    :param str html:
	        HTML to be parsed for objects.
	    :param str preceding_regex:
	        Regex to find the string preceding each object.
	    :rtype list:
	    :returns:
	        A list with one parsed object per successfully parsed match.
	    :raises HTMLParseError:
	        If no match produced a parseable object.
	    """
	    pattern = re.compile(preceding_regex)
	    parsed_objects = []

	    for hit in pattern.finditer(html):
	        try:
	            parsed = parse_for_object_from_startpoint(html, hit.end())
	        except HTMLParseError:
	            # Some of the instances might fail because set is technically
	            # a method of the ytcfg object. Those are skipped since they
	            # don't seem relevant at the moment.
	            continue
	        parsed_objects.append(parsed)

	    if not parsed_objects:
	        raise HTMLParseError(f'No matches for regex {preceding_regex}')

	    return parsed_objects


	def parse_for_object(html, preceding_regex):
	    """Parse the object that follows the first match of ``preceding_regex``.

	    :param str html:
	        HTML to be parsed for an object.
	    :param str preceding_regex:
	        Regex to find the string preceding the object.
	    :rtype dict:
	    :returns:
	        The value produced by ``parse_for_object_from_startpoint`` for the
	        text immediately after the first match.
	    :raises HTMLParseError:
	        If the regex does not match anywhere in ``html``.
	    """
	    first_hit = re.compile(preceding_regex).search(html)
	    if first_hit is None:
	        raise HTMLParseError(f'No matches for regex {preceding_regex}')

	    # The object begins right where the preceding text ends.
	    return parse_for_object_from_startpoint(html, first_hit.end())


	def find_object_from_startpoint(html, start_point):
	    """Extract the raw text of a bracketed JavaScript object or array.

	    Scans forward from ``start_point``, tracking nested ``{}`` / ``[]`` pairs
	    and quoted strings, until the opening bracket has been balanced.

	    :param str html:
	        HTML to be parsed for an object.
	    :param int start_point:
	        Index of where the object starts.
	    :rtype: str
	    :returns:
	        The substring running from the opening bracket through its matching
	        closer. If the input ends before the opener is balanced, everything
	        from ``start_point`` to the end of ``html`` is returned.
	    :raises HTMLParseError:
	        If the character at ``start_point`` is not ``{`` or ``[`` (including
	        when ``start_point`` is at or past the end of ``html``).
	    """
	    html = html[start_point:]
	    # Slice instead of indexing so an empty remainder is reported as an
	    # invalid start point rather than escaping as an IndexError.
	    if html[:1] not in ('{', '['):
	        raise HTMLParseError(f'Invalid start point. Start of HTML:\n{html[:20]}')

	    # Maps each context opener to the character that closes it. Both quote
	    # styles are string contexts, so brackets inside them are ignored.
	    context_closers = {
	        '{': '}',
	        '[': ']',
	        '"': '"',
	        "'": "'",
	    }
	    string_contexts = ('"', "'")

	    # The first character is an opening bracket, so it seeds the stack and
	    # scanning starts at the second character.
	    stack = [html[0]]
	    i = 1

	    while i < len(html) and stack:
	        curr_char = html[i]
	        curr_context = stack[-1]

	        if curr_char == context_closers[curr_context]:
	            # Reached the closer of the innermost context.
	            stack.pop()
	        elif curr_context in string_contexts:
	            # Strings can contain openers and closers, so nothing is pushed
	            # here. A backslash escapes the next character: step over it.
	            if curr_char == '\\':
	                i += 1
	        elif curr_char in context_closers:
	            # Outside of strings, any opener starts a nested context.
	            stack.append(curr_char)

	        i += 1

	    return html[:i]


	def parse_for_object_from_startpoint(html, start_point):
	    """JSONifies an object parsed from HTML.

	    The object text is located with ``find_object_from_startpoint`` and then
	    decoded as JSON. If it is not valid JSON, it is evaluated as a Python
	    literal instead.

	    :param str html:
	        HTML to be parsed for an object.
	    :param int start_point:
	        Index of where the object starts.
	    :rtype dict:
	    :returns:
	        The value created from parsing the object text.
	    :raises HTMLParseError:
	        If the text is neither valid JSON nor a valid Python literal.
	    """
	    full_obj = find_object_from_startpoint(html, start_point)

	    try:
	        return json.loads(full_obj)
	    except json.decoder.JSONDecodeError:
	        # Not strict JSON; fall through to the literal parser.
	        pass

	    try:
	        return ast.literal_eval(full_obj)
	    except (ValueError, SyntaxError, TypeError) as err:
	        # literal_eval can also raise TypeError for malformed input (for
	        # example an unhashable dict key); report every such failure the
	        # same way and keep the underlying cause attached.
	        raise HTMLParseError('Could not parse object.') from err


	def throttling_array_split(js_array):
	    """Split the throttling array into a python list of element strings.

	    Expects input to begin with `[` and close with `]`.

	    :param str js_array:
	        The javascript array, as a string.
	    :rtype: list:
	    :returns:
	        A list of strings representing splits on `,` in the throttling array.
	        An element that starts with ``function`` is kept whole.
	    """
	    func_header = re.compile(r"function\([^)]*\)")

	    elements = []
	    # Drop the opening bracket; the closing one is handled at the last element.
	    remaining = js_array[1:]

	    while remaining:
	        if remaining.startswith('function'):
	            # Functions can contain commas, so take the header plus the
	            # whole bracketed body as one element.
	            header_end = func_header.search(remaining).end()
	            body_text = find_object_from_startpoint(remaining, header_end)
	            element = remaining[:header_end + len(body_text)]
	            elements.append(element)
	            # Skip the element and the single separator character after it.
	            remaining = remaining[len(element) + 1:]
	            continue

	        comma_at = remaining.find(',')
	        if comma_at == -1:
	            # Last element: everything except the final character.
	            cut = len(remaining) - 1
	        else:
	            cut = comma_at

	        elements.append(remaining[:cut])
	        remaining = remaining[cut + 1:]

	    return elements