#!/usr/bin/env python
# -*- coding: utf-8 -*-
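"""webpage2html: save a complete web page, with its css, javascript, images
and fonts inlined, into a single self-contained html file.

Minimal programmatic sketch (the command line entry point is main() below):

    from webpage2html import generate
    html = generate('http://www.google.com')
"""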
from __future__ import print_function
import argparse
import base64
import codecs
import datetime
import os
import re
import sys
import requests
from bs4 import BeautifulSoup
from termcolor import colored
if sys.version > '3':
    from urllib.parse import urlparse, urlunsplit, urljoin, quote
else:
    from urlparse import urlparse, urlunsplit, urljoin
    from urllib import quote

re_css_url = re.compile(r'(url\(.*?\))')
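# in-memory cache of downloaded resources, keyed by absolute (quoted) URL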
webpage2html_cache = {}
def log(s, color=None, on_color=None, attrs=None, new_line=True):
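    """Print s to stderr, optionally colored via termcolor, and flush."""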
    if not color:
        print(str(s), end=' ', file=sys.stderr)
    else:
        print(colored(str(s), color, on_color, attrs), end=' ', file=sys.stderr)
    if new_line:
        sys.stderr.write('\n')
    sys.stderr.flush()


def absurl(index, relpath=None, normpath=None):
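    """Join relpath onto index: returns an absolute URL when either one is an
    http(s) URL, otherwise a local filesystem path. The optional normpath
    callable is applied to the resulting path."""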
    if normpath is None:
        normpath = lambda x: x
    if index.lower().startswith('http') or (relpath and relpath.startswith('http')):
        new = urlparse(urljoin(index, relpath))
        return urlunsplit((new.scheme, new.netloc, normpath(new.path), new.query, ''))
        # normpath is not a function by default, so why is the normpath(path) form used everywhere here?
        # netloc contains basic auth, so do not use domain
    else:
        if relpath:
            return normpath(os.path.join(os.path.dirname(index), relpath))
        else:
            return index


def get(index, relpath=None, verbose=True, usecache=True, verify=True, ignore_error=False, username=None, password=None):
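    """Fetch index (or relpath resolved against index) over HTTP, or read it
    from the local filesystem. Returns (content, extra) where extra is a dict
    with the final 'url' and 'content-type' for fresh HTTP fetches and None
    otherwise; content is '' on failure."""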
    global webpage2html_cache
    if index.startswith('http') or (relpath and relpath.startswith('http')):
        full_path = absurl(index, relpath)
        if not full_path:
            if verbose:
                log('[ WARN ] invalid path, %s %s' % (index, relpath), 'yellow')
            return '', None
        # urllib2 only accepts valid url, the following code is taken from urllib
        # http://svn.python.org/view/python/trunk/Lib/urllib.py?r1=71780&r2=71779&pathrev=71780
        full_path = quote(full_path, safe="%/:=&?~#+!$,;'@()*[]")
        if usecache:
            if full_path in webpage2html_cache:
                if verbose:
                    log('[ CACHE HIT ] - %s' % full_path)
                return webpage2html_cache[full_path], None
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
        }
        auth = None
        if username and password:
            auth = requests.auth.HTTPBasicAuth(username, password)
        try:
            response = requests.get(full_path, headers=headers, verify=verify, auth=auth)
            if verbose:
                log('[ GET ] %d - %s' % (response.status_code, response.url))
            if not ignore_error and (response.status_code >= 400 or response.status_code < 200):
                content = ''
            elif response.headers.get('content-type', '').lower().startswith('text/'):
                content = response.text
            else:
                content = response.content
            if usecache:
                webpage2html_cache[response.url] = content
            return content, {'url': response.url, 'content-type': response.headers.get('content-type')}
        except Exception as ex:
            if verbose:
                log('[ WARN ] %s - %s %s' % ('???', full_path, ex), 'yellow')
            return '', None
    elif os.path.exists(index):
        if relpath:
            relpath = relpath.split('#')[0].split('?')[0]
            if os.path.exists(relpath):
                full_path = relpath
            else:
                full_path = os.path.normpath(os.path.join(os.path.dirname(index), relpath))
            try:
                ret = open(full_path, 'rb').read()
                if verbose:
                    log('[ LOCAL ] found - %s' % full_path)
                return ret, None
            except IOError as err:
                if verbose:
                    log('[ WARN ] file not found - %s %s' % (full_path, str(err)), 'yellow')
                return '', None
        else:
            try:
                ret = open(index, 'rb').read()
                if verbose:
                    log('[ LOCAL ] found - %s' % index)
                return ret, None
            except IOError as err:
                if verbose:
                    log('[ WARN ] file not found - %s %s' % (index, str(err)), 'yellow')
                return '', None
    else:
        if verbose:
            log('[ ERROR ] invalid index - %s' % index, 'red')
        return '', None


def data_to_base64(index, src, verbose=True):
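    """Fetch src (resolved against index) and return it inlined as a data: URI
    such as 'data:image/png;base64,iVBORw0...'; if nothing could be fetched,
    fall back to the absolute URL instead."""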
    # doc here: http://en.wikipedia.org/wiki/Data_URI_scheme
    sp = urlparse(src).path.lower()
    if src.strip().startswith('data:'):
        return src
    if sp.endswith('.png'):
        fmt = 'image/png'
    elif sp.endswith('.gif'):
        fmt = 'image/gif'
    elif sp.endswith('.ico'):
        fmt = 'image/x-icon'
    elif sp.endswith('.jpg') or sp.endswith('.jpeg'):
        fmt = 'image/jpeg'
    elif sp.endswith('.svg'):
        fmt = 'image/svg+xml'
    elif sp.endswith('.ttf'):
        fmt = 'application/x-font-ttf'
    elif sp.endswith('.otf'):
        fmt = 'application/x-font-opentype'
    elif sp.endswith('.woff'):
        fmt = 'application/font-woff'
    elif sp.endswith('.woff2'):
        fmt = 'application/font-woff2'
    elif sp.endswith('.eot'):
        fmt = 'application/vnd.ms-fontobject'
    elif sp.endswith('.sfnt'):
        fmt = 'application/font-sfnt'
    elif sp.endswith('.css') or sp.endswith('.less'):
        fmt = 'text/css'
    elif sp.endswith('.js'):
        fmt = 'application/javascript'
    else:
        # what if it's not a valid font type? may not matter
        fmt = 'image/png'
    data, extra_data = get(index, src, verbose=verbose)
    if extra_data and extra_data.get('content-type'):
        fmt = extra_data.get('content-type').replace(' ', '')
    if data:
        if sys.version > '3':
            if type(data) is bytes:
                return ('data:%s;base64,' % fmt) + bytes.decode(base64.b64encode(data))
            else:
                return ('data:%s;base64,' % fmt) + bytes.decode(base64.b64encode(str.encode(data)))
        else:
            reload(sys)
            sys.setdefaultencoding('utf-8')
            return ('data:%s;base64,' % fmt) + base64.b64encode(data)
    else:
        return absurl(index, src)


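# matches a stylesheet charset declaration, e.g.: @charset "utf-8";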
css_encoding_re = re.compile(r'''@charset\s+["']([-_a-zA-Z0-9]+)["']\;''', re.I)
def handle_css_content(index, css, verbose=True):
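    """Decode css (bytes or str) honouring any @charset declaration, then
    rewrite every url(...) reference inside it as an inline data: URI."""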
    if not css:
        return css
    if not isinstance(css, str):
        if sys.version > '3':
            # decode the raw bytes as utf-8 first, then honour an explicit
            # @charset declaration if one is present
            text = css.decode('utf-8', 'replace')
            mo = css_encoding_re.search(text)
            if mo:
                try:
                    css = css.decode(mo.group(1))
                except (LookupError, UnicodeDecodeError):
                    css = text
                    log('[ WARN ] failed to convert css to encoding %s' % mo.group(1), 'yellow')
            else:
                css = text
        else:
            mo = css_encoding_re.search(css)
            if mo:
                try:
                    css = css.decode(mo.group(1))
                except:
                    log('[ WARN ] failed to convert css to encoding %s' % mo.group(1), 'yellow')
    # Note: this regex cannot match url() values that contain parentheses; but
    # unescaped parentheses break the css rule itself, so such urls are already
    # invalid in plain css.
    reg = re.compile(r'url\s*\((.+?)\)')

    def repl(matchobj):
        src = matchobj.group(1).strip(' \'"')
        # if src.lower().endswith('woff') or src.lower().endswith('ttf') or src.lower().endswith('otf') or src.lower().endswith('eot'):
        #     # dont handle font data uri currently
        #     return 'url(' + src + ')'
        return 'url(' + data_to_base64(index, src, verbose=verbose) + ')'

    css = reg.sub(repl, css)
    return css


def generate(index, verbose=True, comment=True, keep_script=False, prettify=False, full_url=True, verify=True,
             errorpage=False, username=None, password=None, **kwargs):
"""
given a index url such as http://www.google.com, http://custom.domain/index.html
return generated single html
"""
    html_doc, extra_data = get(index, verbose=verbose, verify=verify, ignore_error=errorpage,
                               username=username, password=password)
    if extra_data and extra_data.get('url'):
        index = extra_data['url']
    # now build the dom tree
    soup = BeautifulSoup(html_doc, 'lxml')
    soup_title = soup.title.string if soup.title else ''
    for link in soup('link'):
        if link.get('href'):
            if 'mask-icon' in (link.get('rel') or []) or 'icon' in (link.get('rel') or []) or 'apple-touch-icon' in (
                    link.get('rel') or []) or 'apple-touch-icon-precomposed' in (link.get('rel') or []):
                link['data-href'] = link['href']
                link['href'] = data_to_base64(index, link['href'], verbose=verbose)
            elif link.get('type') == 'text/css' or link['href'].lower().endswith('.css') or 'stylesheet' in (
                    link.get('rel') or []):
                new_type = 'text/css' if not link.get('type') else link['type']
                css = soup.new_tag('style', type=new_type)
                css['data-href'] = link['href']
                for attr in link.attrs:
                    if attr in ['href']:
                        continue
                    css[attr] = link[attr]
                css_data, _ = get(index, relpath=link['href'], verbose=verbose)
                new_css_content = handle_css_content(absurl(index, link['href']), css_data, verbose=verbose)
                # if "stylesheet/less" in '\n'.join(link.get('rel') or []).lower():  # fix browser side less: http://lesscss.org/#client-side-usage
                #     # link['href'] = 'data:text/less;base64,' + base64.b64encode(css_data)
                #     link['data-href'] = link['href']
                #     link['href'] = absurl(index, link['href'])
                if False:  # new_css_content.find('@font-face') > -1 or new_css_content.find('@FONT-FACE') > -1:
                    link['href'] = 'data:text/css;base64,' + base64.b64encode(new_css_content)
                else:
                    css.string = new_css_content
                    link.replace_with(css)
            elif full_url:
                link['data-href'] = link['href']
                link['href'] = absurl(index, link['href'])
    for js in soup('script'):
        if not keep_script:
            js.replace_with('')
            continue
        if not js.get('src'):
            continue
        new_type = 'text/javascript' if not js.has_attr('type') or not js['type'] else js['type']
        code = soup.new_tag('script', type=new_type)
        code['data-src'] = js['src']
        js_str, _ = get(index, relpath=js['src'], verbose=verbose)
        if type(js_str) == bytes:
            js_str = js_str.decode('utf-8')
        try:
            if js_str.find('</script>') > -1:
                code['src'] = 'data:text/javascript;base64,' + base64.b64encode(js_str.encode()).decode()
            elif js_str.find(']]>') < 0:
                code.string = '<!--//--><![CDATA[//><!--\n' + js_str + '\n//--><!]]>'
            else:
                # escaping ]]> as suggested at http://en.wikipedia.org/wiki/CDATA does not work
                # in chrome, so embed the script unmodified instead
                # code.string = '<![CDATA[\n' + js_str.replace(']]>', ']]]]><![CDATA[>') + '\n]]>'
                code.string = js_str
        except:
            if verbose:
                log(repr(js_str))
            raise
        js.replace_with(code)
    for img in soup('img'):
        if not img.get('src'):
            continue
        img['data-src'] = img['src']
        img['src'] = data_to_base64(index, img['src'], verbose=verbose)
        # `img` elements may carry a `srcset` attribute with multiple image candidates.
        # To keep the generated document small, it is cleared and only the standard `src`
        # attribute is inlined. Maybe add a flag to base64-encode every `srcset` entry?
        # For now a warning is logged to note that the extra sources are stripped.
        if img.get('srcset'):
            img['data-srcset'] = img['srcset']
            del img['srcset']
            if verbose:
                log('[ WARN ] srcset found in img tag. Attribute will be cleared. File src => %s' % (img['data-src']),
                    'yellow')

        def check_alt(attr):
            if img.has_attr(attr) and img[attr].startswith('this.src='):
                # we do not handle this situation yet, just warn the user
                if verbose:
                    log('[ WARN ] %s found in img tag and unhandled, which may break page' % attr, 'yellow')

        check_alt('onerror')
        check_alt('onmouseover')
        check_alt('onmouseout')
    for tag in soup(True):
        if full_url and tag.name == 'a' and tag.has_attr('href') and not tag['href'].startswith('#'):
            tag['data-href'] = tag['href']
            tag['href'] = absurl(index, tag['href'])
        if tag.has_attr('style'):
            if tag['style']:
                tag['style'] = handle_css_content(index, tag['style'], verbose=verbose)
        elif tag.name == 'link' and tag.has_attr('type') and tag['type'] == 'text/css':
            if tag.string:
                tag.string = handle_css_content(index, tag.string, verbose=verbose)
        elif tag.name == 'style':
            if tag.string:
                tag.string = handle_css_content(index, tag.string, verbose=verbose)
    # finally insert some info into comments
    if comment:
        for html in soup('html'):
            html.insert(0, BeautifulSoup(
                '<!-- \n single html processed by https://github.com/zTrix/webpage2html\n '
                'title: %s\n url: %s\n date: %s\n-->' % (soup_title, index, datetime.datetime.now().ctime()),
                'lxml'))
            break
    if prettify:
        return soup.prettify(formatter='html')
    else:
        return str(soup)


def usage():
print("""
usage:
$ webpage2html [options] some_url
options:
-h, --help help page, you are reading this now!
-q, --quiet don't show verbose url get log in stderr
-s, --script keep javascript in the generated html
examples:
$ webpage2html -h
you are reading this help message
$ webpage2html http://www.google.com > google.html
save google index page for offline reading, keep style untainted
$ webpage2html -s http://gabrielecirulli.github.io/2048/ > 2048.html
save dynamic page with Javascript example
the 2048 game can be played offline after being saved
$ webpage2html /path/to/xxx.html > xxx_single.html
combine local saved xxx.html with a directory named xxx_files together into a single html file
""")
def main():
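    """Parse command line arguments, run generate() and write the resulting
    html to the requested output file or to stdout."""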
    kwargs = {}
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--quiet', action='store_true', help="don't show verbose url get log in stderr")
    parser.add_argument('-s', '--script', action='store_true', help="keep javascript in the generated html")
    parser.add_argument('-k', '--insecure', action='store_true', help="ignore the certificate")
    parser.add_argument('-o', '--output', help="save output to")
    parser.add_argument('-u', '--username', help="use HTTP basic auth with specified username")
    parser.add_argument('-p', '--password', help="use HTTP basic auth with specified password")
    parser.add_argument('--errorpage', action='store_true', help="crawl an error page")
    parser.add_argument("url", help="the website to store")
    args = parser.parse_args()
    args.verbose = not args.quiet
    args.keep_script = args.script
    args.verify = not args.insecure
    args.index = args.url
    kwargs = vars(args)
    rs = generate(**kwargs)
    if args.output and args.output != '-':
        with open(args.output, 'wb') as f:
            f.write(rs.encode())
    else:
        sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
        sys.stdout.write(rs)


if __name__ == '__main__':
    main()