#!/usr/bin/env python
# -*- coding: utf-8 -*-
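"""webpage2html: save a complete web page, with its css, javascript, images
and fonts inlined, into a single self-contained html file.

Minimal programmatic sketch (the command line entry point is main() below):

    from webpage2html import generate
    html = generate('http://www.google.com')
"""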
from __future__ import print_function
import argparse
import base64
import codecs
import datetime
import os
import re
import sys
import requests
from bs4 import BeautifulSoup
from termcolor import colored
if sys.version > '3':
    from urllib.parse import urlparse, urlunsplit, urljoin, quote
else:
    from urlparse import urlparse, urlunsplit, urljoin
    from urllib import quote

re_css_url = re.compile(r'(url\(.*?\))')
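# in-memory cache of downloaded resources, keyed by absolute (quoted) URL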
webpage2html_cache = {}
def log(s, color=None, on_color=None, attrs=None, new_line=True):
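    """Print s to stderr, optionally colored via termcolor, and flush."""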
    if not color:
        print(str(s), end=' ', file=sys.stderr)
    else:
        print(colored(str(s), color, on_color, attrs), end=' ', file=sys.stderr)
    if new_line:
        sys.stderr.write('\n')
    sys.stderr.flush()


def absurl(index, relpath=None, normpath=None):
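    """Join relpath onto index: returns an absolute URL when either one is an
    http(s) URL, otherwise a local filesystem path. The optional normpath
    callable is applied to the resulting path."""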
    if normpath is None:
        normpath = lambda x: x
    if index.lower().startswith('http') or (relpath and relpath.startswith('http')):
        new = urlparse(urljoin(index, relpath))
        return urlunsplit((new.scheme, new.netloc, normpath(new.path), new.query, ''))
        # normpath is not a function by default, so why is the normpath(path) form used everywhere here?
        # netloc contains basic auth, so do not use domain
    else:
        if relpath:
            return normpath(os.path.join(os.path.dirname(index), relpath))
        else:
            return index


def get(index, relpath=None, verbose=True, usecache=True, verify=True, ignore_error=False, username=None, password=None):
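    """Fetch index (or relpath resolved against index) over HTTP, or read it
    from the local filesystem. Returns (content, extra) where extra is a dict
    with the final 'url' and 'content-type' for fresh HTTP fetches and None
    otherwise; content is '' on failure."""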
    global webpage2html_cache
    if index.startswith('http') or (relpath and relpath.startswith('http')):
        full_path = absurl(index, relpath)
        if not full_path:
            if verbose:
                log('[ WARN ] invalid path, %s %s' % (index, relpath), 'yellow')
            return '', None
        # urllib2 only accepts valid url, the following code is taken from urllib
        # http://svn.python.org/view/python/trunk/Lib/urllib.py?r1=71780&r2=71779&pathrev=71780
        full_path = quote(full_path, safe="%/:=&?~#+!$,;'@()*[]")
        if usecache:
            if full_path in webpage2html_cache:
                if verbose:
                    log('[ CACHE HIT ] - %s' % full_path)
                return webpage2html_cache[full_path], None
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
        }
        auth = None
        if username and password:
            auth = requests.auth.HTTPBasicAuth(username, password)
        try:
            response = requests.get(full_path, headers=headers, verify=verify, auth=auth)
            if verbose:
                log('[ GET ] %d - %s' % (response.status_code, response.url))
            if not ignore_error and (response.status_code >= 400 or response.status_code < 200):
                content = ''
            elif response.headers.get('content-type', '').lower().startswith('text/'):
                content = response.text
            else:
                content = response.content
            if usecache:
                webpage2html_cache[response.url] = content
            return content, {'url': response.url, 'content-type': response.headers.get('content-type')}
        except Exception as ex:
            if verbose:
                log('[ WARN ] %s - %s %s' % ('???', full_path, ex), 'yellow')
            return '', None
    elif os.path.exists(index):
        if relpath:
            relpath = relpath.split('#')[0].split('?')[0]
            if os.path.exists(relpath):
                full_path = relpath
            else:
                full_path = os.path.normpath(os.path.join(os.path.dirname(index), relpath))
            try:
                ret = open(full_path, 'rb').read()
                if verbose:
                    log('[ LOCAL ] found - %s' % full_path)
                return ret, None
            except IOError as err:
                if verbose:
                    log('[ WARN ] file not found - %s %s' % (full_path, str(err)), 'yellow')
                return '', None
        else:
            try:
                ret = open(index, 'rb').read()
                if verbose:
                    log('[ LOCAL ] found - %s' % index)
                return ret, None
            except IOError as err:
                if verbose:
                    log('[ WARN ] file not found - %s %s' % (index, str(err)), 'yellow')
                return '', None
    else:
        if verbose:
            log('[ ERROR ] invalid index - %s' % index, 'red')
        return '', None


def data_to_base64(index, src, verbose=True):
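    """Fetch src (resolved against index) and return it inlined as a data: URI
    such as 'data:image/png;base64,iVBORw0...'; if nothing could be fetched,
    fall back to the absolute URL instead."""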
    # doc here: http://en.wikipedia.org/wiki/Data_URI_scheme
    sp = urlparse(src).path.lower()
    if src.strip().startswith('data:'):
        return src
    if sp.endswith('.png'):
        fmt = 'image/png'
    elif sp.endswith('.gif'):
        fmt = 'image/gif'
    elif sp.endswith('.ico'):
        fmt = 'image/x-icon'
    elif sp.endswith('.jpg') or sp.endswith('.jpeg'):
        fmt = 'image/jpeg'
    elif sp.endswith('.svg'):
        fmt = 'image/svg+xml'
    elif sp.endswith('.ttf'):
        fmt = 'application/x-font-ttf'
    elif sp.endswith('.otf'):
        fmt = 'application/x-font-opentype'
    elif sp.endswith('.woff'):
        fmt = 'application/font-woff'
    elif sp.endswith('.woff2'):
        fmt = 'application/font-woff2'
    elif sp.endswith('.eot'):
        fmt = 'application/vnd.ms-fontobject'
    elif sp.endswith('.sfnt'):
        fmt = 'application/font-sfnt'
    elif sp.endswith('.css') or sp.endswith('.less'):
        fmt = 'text/css'
    elif sp.endswith('.js'):
        fmt = 'application/javascript'
    else:
        # what if it's not a valid font type? may not matter
        fmt = 'image/png'
    data, extra_data = get(index, src, verbose=verbose)
    if extra_data and extra_data.get('content-type'):
        fmt = extra_data.get('content-type').replace(' ', '')
    if data:
        if sys.version > '3':
            if type(data) is bytes:
                return ('data:%s;base64,' % fmt) + bytes.decode(base64.b64encode(data))
            else:
                return ('data:%s;base64,' % fmt) + bytes.decode(base64.b64encode(str.encode(data)))
        else:
            reload(sys)
            sys.setdefaultencoding('utf-8')
            return ('data:%s;base64,' % fmt) + base64.b64encode(data)
    else:
        return absurl(index, src)


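# matches a stylesheet charset declaration, e.g.: @charset "utf-8";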
css_encoding_re = re.compile(r'''@charset\s+["']([-_a-zA-Z0-9]+)["']\;''', re.I)
def handle_css_content(index, css, verbose=True):
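    """Decode css (bytes or str) honouring any @charset declaration, then
    rewrite every url(...) reference inside it as an inline data: URI."""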
    if not css:
        return css
    if not isinstance(css, str):
        if sys.version > '3':
            # decode the raw bytes as utf-8 first, then honour an explicit
            # @charset declaration if one is present
            text = css.decode('utf-8', 'replace')
            mo = css_encoding_re.search(text)
            if mo:
                try:
                    css = css.decode(mo.group(1))
                except (LookupError, UnicodeDecodeError):
                    css = text
                    log('[ WARN ] failed to convert css to encoding %s' % mo.group(1), 'yellow')
            else:
                css = text
        else:
            mo = css_encoding_re.search(css)
            if mo:
                try:
                    css = css.decode(mo.group(1))
                except:
                    log('[ WARN ] failed to convert css to encoding %s' % mo.group(1), 'yellow')
    # Note: this regex cannot match url() values that contain parentheses; but
    # unescaped parentheses break the css rule itself, so such urls are already
    # invalid in plain css.
    reg = re.compile(r'url\s*\((.+?)\)')

    def repl(matchobj):
        src = matchobj.group(1).strip(' \'"')
        # if src.lower().endswith('woff') or src.lower().endswith('ttf') or src.lower().endswith('otf') or src.lower().endswith('eot'):
        #     # dont handle font data uri currently
        #     return 'url(' + src + ')'
        return 'url(' + data_to_base64(index, src, verbose=verbose) + ')'

    css = reg.sub(repl, css)
    return css


def generate(index, verbose=True, comment=True, keep_script=False, prettify=False, full_url=True, verify=True,
             errorpage=False, username=None, password=None, **kwargs):
"""
given a index url such as http://www.google.com, http://custom.domain/index.html
return generated single html
"""
    html_doc, extra_data = get(index, verbose=verbose, verify=verify, ignore_error=errorpage,
                               username=username, password=password)
    if extra_data and extra_data.get('url'):
        index = extra_data['url']
    # now build the dom tree
    soup = BeautifulSoup(html_doc, 'lxml')
    soup_title = soup.title.string if soup.title else ''
    for link in soup('link'):
        if link.get('href'):
            if 'mask-icon' in (link.get('rel') or []) or 'icon' in (link.get('rel') or []) or 'apple-touch-icon' in (
                    link.get('rel') or []) or 'apple-touch-icon-precomposed' in (link.get('rel') or []):
                link['data-href'] = link['href']
                link['href'] = data_to_base64(index, link['href'], verbose=verbose)
            elif link.get('type') == 'text/css' or link['href'].lower().endswith('.css') or 'stylesheet' in (
                    link.get('rel') or []):
                new_type = 'text/css' if not link.get('type') else link['type']
                css = soup.new_tag('style', type=new_type)
                css['data-href'] = link['href']
                for attr in link.attrs:
                    if attr in ['href']:
                        continue
                    css[attr] = link[attr]
                css_data, _ = get(index, relpath=link['href'], verbose=verbose)
                new_css_content = handle_css_content(absurl(index, link['href']), css_data, verbose=verbose)
                # if "stylesheet/less" in '\n'.join(link.get('rel') or []).lower():  # fix browser side less: http://lesscss.org/#client-side-usage
                #     # link['href'] = 'data:text/less;base64,' + base64.b64encode(css_data)
                #     link['data-href'] = link['href']
                #     link['href'] = absurl(index, link['href'])
                if False:  # new_css_content.find('@font-face') > -1 or new_css_content.find('@FONT-FACE') > -1:
                    link['href'] = 'data:text/css;base64,' + base64.b64encode(new_css_content)
                else:
                    css.string = new_css_content
                    link.replace_with(css)
            elif full_url:
                link['data-href'] = link['href']
                link['href'] = absurl(index, link['href'])
    for js in soup('script'):
        if not keep_script:
            js.replace_with('')
            continue
        if not js.get('src'):
            continue
        new_type = 'text/javascript' if not js.has_attr('type') or not js['type'] else js['type']
        code = soup.new_tag('script', type=new_type)
        code['data-src'] = js['src']
        js_str, _ = get(index, relpath=js['src'], verbose=verbose)
        if type(js_str) == bytes:
            js_str = js_str.decode('utf-8')
        try:
            if js_str.find('</script>') > -1:
                code['src'] = 'data:text/javascript;base64,' + base64.b64encode(js_str.encode()).decode()
            elif js_str.find(']]>') < 0:
                code.string = '<!--//--><![CDATA[//><!--\n' + js_str + '\n//--><!]]>'
            else:
                # escaping ]]> as suggested at http://en.wikipedia.org/wiki/CDATA does not work
                # in chrome, so embed the script unmodified instead
                # code.string = '<![CDATA[\n' + js_str.replace(']]>', ']]]]><![CDATA[>') + '\n]]>'
                code.string = js_str
        except:
            if verbose:
                log(repr(js_str))
            raise
        js.replace_with(code)
    for img in soup('img'):
        if not img.get('src'):
            continue
        img['data-src'] = img['src']
        img['src'] = data_to_base64(index, img['src'], verbose=verbose)
        # `img` elements may carry a `srcset` attribute with multiple image candidates.
        # To keep the generated document small, it is cleared and only the standard `src`
        # attribute is inlined. Maybe add a flag to base64-encode every `srcset` entry?
        # For now a warning is logged to note that the extra sources are stripped.
        if img.get('srcset'):
            img['data-srcset'] = img['srcset']
            del img['srcset']
            if verbose:
                log('[ WARN ] srcset found in img tag. Attribute will be cleared. File src => %s' % (img['data-src']),
                    'yellow')

        def check_alt(attr):
            if img.has_attr(attr) and img[attr].startswith('this.src='):
                # we do not handle this situation yet, just warn the user
                if verbose:
                    log('[ WARN ] %s found in img tag and unhandled, which may break page' % attr, 'yellow')

        check_alt('onerror')
        check_alt('onmouseover')
        check_alt('onmouseout')
    for tag in soup(True):
        if full_url and tag.name == 'a' and tag.has_attr('href') and not tag['href'].startswith('#'):
            tag['data-href'] = tag['href']
            tag['href'] = absurl(index, tag['href'])
        if tag.has_attr('style'):
            if tag['style']:
                tag['style'] = handle_css_content(index, tag['style'], verbose=verbose)
        elif tag.name == 'link' and tag.has_attr('type') and tag['type'] == 'text/css':
            if tag.string:
                tag.string = handle_css_content(index, tag.string, verbose=verbose)
        elif tag.name == 'style':
            if tag.string:
                tag.string = handle_css_content(index, tag.string, verbose=verbose)
    # finally insert some info into comments
    if comment:
        for html in soup('html'):
            html.insert(0, BeautifulSoup(
                '<!-- \n single html processed by https://github.com/zTrix/webpage2html\n '
                'title: %s\n url: %s\n date: %s\n-->' % (soup_title, index, datetime.datetime.now().ctime()),
                'lxml'))
            break
    if prettify:
        return soup.prettify(formatter='html')
    else:
        return str(soup)


def usage():
print("""
usage:
$ webpage2html [options] some_url
options:
-h, --help help page, you are reading this now!
-q, --quiet don't show verbose url get log in stderr
-s, --script keep javascript in the generated html
examples:
$ webpage2html -h
you are reading this help message
$ webpage2html http://www.google.com > google.html
save google index page for offline reading, keep style untainted
$ webpage2html -s http://gabrielecirulli.github.io/2048/ > 2048.html
save dynamic page with Javascript example
the 2048 game can be played offline after being saved
$ webpage2html /path/to/xxx.html > xxx_single.html
combine local saved xxx.html with a directory named xxx_files together into a single html file
""")
def main():
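    """Parse command line arguments, run generate() and write the resulting
    html to the requested output file or to stdout."""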
    kwargs = {}
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--quiet', action='store_true', help="don't show verbose url get log in stderr")
    parser.add_argument('-s', '--script', action='store_true', help="keep javascript in the generated html")
    parser.add_argument('-k', '--insecure', action='store_true', help="ignore the certificate")
    parser.add_argument('-o', '--output', help="save output to")
    parser.add_argument('-u', '--username', help="use HTTP basic auth with specified username")
    parser.add_argument('-p', '--password', help="use HTTP basic auth with specified password")
    parser.add_argument('--errorpage', action='store_true', help="crawl an error page")
    parser.add_argument("url", help="the website to store")
    args = parser.parse_args()
    args.verbose = not args.quiet
    args.keep_script = args.script
    args.verify = not args.insecure
    args.index = args.url
    kwargs = vars(args)
    rs = generate(**kwargs)
    if args.output and args.output != '-':
        with open(args.output, 'wb') as f:
            f.write(rs.encode())
    else:
        sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
        sys.stdout.write(rs)


if __name__ == '__main__':
    main()