# -*- coding: utf-8 -*- | |
# | |
# Copyright (C) 2012-2015 Vinay Sajip. | |
# Licensed to the Python Software Foundation under a contributor agreement. | |
# See LICENSE.txt and CONTRIBUTORS.txt. | |
# | |
import gzip | |
from io import BytesIO | |
import json | |
import logging | |
import os | |
import posixpath | |
import re | |
try: | |
import threading | |
except ImportError: # pragma: no cover | |
import dummy_threading as threading | |
import zlib | |
from . import DistlibException | |
from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url, | |
queue, quote, unescape, build_opener, | |
HTTPRedirectHandler as BaseRedirectHandler, text_type, | |
Request, HTTPError, URLError) | |
from .database import Distribution, DistributionPath, make_dist | |
from .metadata import Metadata, MetadataInvalidError | |
from .util import (cached_property, ensure_slash, split_filename, get_project_data, | |
parse_requirement, parse_name_and_version, ServerProxy, | |
normalize_name) | |
from .version import get_scheme, UnsupportedVersionError | |
from .wheel import Wheel, is_compatible | |
logger = logging.getLogger(__name__) | |
HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)') | |
CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I) | |
HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml') | |
DEFAULT_INDEX = 'https://pypi.org/pypi' | |
def get_all_distribution_names(url=None): | |
""" | |
Return all distribution names known by an index. | |
:param url: The URL of the index. | |
:return: A list of all known distribution names. | |
""" | |
if url is None: | |
url = DEFAULT_INDEX | |
client = ServerProxy(url, timeout=3.0) | |
try: | |
return client.list_packages() | |
finally: | |
client('close')() | |
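# A minimal usage sketch for the helper above, assuming the default index
# accepts list_packages() over XML-RPC (public indexes may rate-limit or
# disable this API). The helper name below is only an illustration.
def _example_get_all_names():  # pragma: no cover - illustrative sketch
    names = get_all_distribution_names()
    print('%d distributions known to the index' % len(names))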
class RedirectHandler(BaseRedirectHandler): | |
""" | |
A class to work around a bug in some Python 3.2.x releases. | |
""" | |
# There's a bug in the base version for some 3.2.x | |
# (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header | |
# returns e.g. /abc, it bails because it says the scheme '' | |
# is bogus, when actually it should use the request's | |
# URL for the scheme. See Python issue #13696. | |
def http_error_302(self, req, fp, code, msg, headers): | |
# Some servers (incorrectly) return multiple Location headers | |
# (so probably same goes for URI). Use first header. | |
newurl = None | |
for key in ('location', 'uri'): | |
if key in headers: | |
newurl = headers[key] | |
break | |
if newurl is None: # pragma: no cover | |
return | |
urlparts = urlparse(newurl) | |
if urlparts.scheme == '': | |
newurl = urljoin(req.get_full_url(), newurl) | |
if hasattr(headers, 'replace_header'): | |
headers.replace_header(key, newurl) | |
else: | |
headers[key] = newurl | |
return BaseRedirectHandler.http_error_302(self, req, fp, code, msg, | |
headers) | |
http_error_301 = http_error_303 = http_error_307 = http_error_302 | |
class Locator(object): | |
""" | |
A base class for locators - things that locate distributions. | |
""" | |
source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz') | |
binary_extensions = ('.egg', '.exe', '.whl') | |
excluded_extensions = ('.pdf',) | |
# A list of tags indicating which wheels you want to match. The default | |
# value of None matches against the tags compatible with the running | |
# Python. If you want to match other values, set wheel_tags on a locator | |
# instance to a list of tuples (pyver, abi, arch) which you want to match. | |
wheel_tags = None | |
downloadable_extensions = source_extensions + ('.whl',) | |
def __init__(self, scheme='default'): | |
""" | |
Initialise an instance. | |
:param scheme: Because locators look for most recent versions, they | |
need to know the version scheme to use. This specifies | |
the current PEP-recommended scheme - use ``'legacy'`` | |
if you need to support existing distributions on PyPI. | |
""" | |
self._cache = {} | |
self.scheme = scheme | |
# Because of bugs in some of the handlers on some of the platforms, | |
# we use our own opener rather than just using urlopen. | |
self.opener = build_opener(RedirectHandler()) | |
# If get_project() is called from locate(), the matcher instance | |
# is set from the requirement passed to locate(). See issue #18 for | |
# why this can be useful to know. | |
self.matcher = None | |
self.errors = queue.Queue() | |
def get_errors(self): | |
""" | |
Return any errors which have occurred. | |
""" | |
result = [] | |
while not self.errors.empty(): # pragma: no cover | |
try: | |
e = self.errors.get(False) | |
result.append(e) | |
except queue.Empty:  # the Empty exception lives on the queue module, not on Queue instances
continue | |
self.errors.task_done() | |
return result | |
def clear_errors(self): | |
""" | |
Clear any errors which may have been logged. | |
""" | |
# Just get the errors and throw them away | |
self.get_errors() | |
def clear_cache(self): | |
self._cache.clear() | |
def _get_scheme(self): | |
return self._scheme | |
def _set_scheme(self, value): | |
self._scheme = value | |
scheme = property(_get_scheme, _set_scheme) | |
def _get_project(self, name): | |
""" | |
For a given project, get a dictionary mapping available versions to Distribution | |
instances. | |
This should be implemented in subclasses. | |
If called from a locate() request, self.matcher will be set to a | |
matcher for the requirement to satisfy, otherwise it will be None. | |
""" | |
raise NotImplementedError('Please implement in the subclass') | |
def get_distribution_names(self): | |
""" | |
Return all the distribution names known to this locator. | |
""" | |
raise NotImplementedError('Please implement in the subclass') | |
def get_project(self, name): | |
""" | |
For a given project, get a dictionary mapping available versions to Distribution | |
instances. | |
This calls _get_project to do all the work, and just implements a caching layer on top. | |
""" | |
if self._cache is None: # pragma: no cover | |
result = self._get_project(name) | |
elif name in self._cache: | |
result = self._cache[name] | |
else: | |
self.clear_errors() | |
result = self._get_project(name) | |
self._cache[name] = result | |
return result | |
def score_url(self, url): | |
""" | |
Give a URL a score which can be used to choose preferred URLs
for a given project release. | |
""" | |
t = urlparse(url) | |
basename = posixpath.basename(t.path) | |
compatible = True | |
is_wheel = basename.endswith('.whl') | |
is_downloadable = basename.endswith(self.downloadable_extensions) | |
if is_wheel: | |
compatible = is_compatible(Wheel(basename), self.wheel_tags) | |
return (t.scheme == 'https', 'pypi.org' in t.netloc, | |
is_downloadable, is_wheel, compatible, basename) | |
def prefer_url(self, url1, url2): | |
""" | |
Choose one of two URLs where both are candidates for distribution | |
archives for the same version of a distribution (for example, | |
.tar.gz vs. zip). | |
The current implementation favours https:// URLs over http://, archives | |
from PyPI over those from other locations, wheel compatibility (if a | |
wheel) and then the archive name. | |
""" | |
result = url2 | |
if url1: | |
s1 = self.score_url(url1) | |
s2 = self.score_url(url2) | |
if s1 > s2: | |
result = url1 | |
if result != url2: | |
logger.debug('Not replacing %r with %r', url1, url2) | |
else: | |
logger.debug('Replacing %r with %r', url1, url2) | |
return result | |
def split_filename(self, filename, project_name): | |
""" | |
Attempt to split a filename into project name, version and Python version.
""" | |
return split_filename(filename, project_name) | |
def convert_url_to_download_info(self, url, project_name): | |
""" | |
See if a URL is a candidate for a download URL for a project (the URL | |
has typically been scraped from an HTML page). | |
If it is, a dictionary is returned with keys "name", "version", | |
"filename" and "url"; otherwise, None is returned. | |
""" | |
def same_project(name1, name2): | |
return normalize_name(name1) == normalize_name(name2) | |
result = None | |
scheme, netloc, path, params, query, frag = urlparse(url) | |
if frag.lower().startswith('egg='): # pragma: no cover | |
logger.debug('%s: version hint in fragment: %r', | |
project_name, frag) | |
m = HASHER_HASH.match(frag) | |
if m: | |
algo, digest = m.groups() | |
else: | |
algo, digest = None, None | |
origpath = path | |
if path and path[-1] == '/': # pragma: no cover | |
path = path[:-1] | |
if path.endswith('.whl'): | |
try: | |
wheel = Wheel(path) | |
if not is_compatible(wheel, self.wheel_tags): | |
logger.debug('Wheel not compatible: %s', path) | |
else: | |
if project_name is None: | |
include = True | |
else: | |
include = same_project(wheel.name, project_name) | |
if include: | |
result = { | |
'name': wheel.name, | |
'version': wheel.version, | |
'filename': wheel.filename, | |
'url': urlunparse((scheme, netloc, origpath, | |
params, query, '')), | |
'python-version': ', '.join( | |
['.'.join(list(v[2:])) for v in wheel.pyver]), | |
} | |
except Exception as e: # pragma: no cover | |
logger.warning('invalid path for wheel: %s', path) | |
elif not path.endswith(self.downloadable_extensions): # pragma: no cover | |
logger.debug('Not downloadable: %s', path) | |
else: # downloadable extension | |
path = filename = posixpath.basename(path) | |
for ext in self.downloadable_extensions: | |
if path.endswith(ext): | |
path = path[:-len(ext)] | |
t = self.split_filename(path, project_name) | |
if not t: # pragma: no cover | |
logger.debug('No match for project/version: %s', path) | |
else: | |
name, version, pyver = t | |
if not project_name or same_project(project_name, name): | |
result = { | |
'name': name, | |
'version': version, | |
'filename': filename, | |
'url': urlunparse((scheme, netloc, origpath, | |
params, query, '')), | |
#'packagetype': 'sdist', | |
} | |
if pyver: # pragma: no cover | |
result['python-version'] = pyver | |
break | |
if result and algo: | |
result['%s_digest' % algo] = digest | |
return result | |
def _get_digest(self, info): | |
""" | |
Get a digest from a dictionary by looking at a "digests" dictionary | |
or keys of the form 'algo_digest'. | |
Returns a 2-tuple (algo, digest) if found, else None. Currently | |
looks only for SHA256, then MD5. | |
""" | |
result = None | |
if 'digests' in info: | |
digests = info['digests'] | |
for algo in ('sha256', 'md5'): | |
if algo in digests: | |
result = (algo, digests[algo]) | |
break | |
if not result: | |
for algo in ('sha256', 'md5'): | |
key = '%s_digest' % algo | |
if key in info: | |
result = (algo, info[key]) | |
break | |
return result | |
def _update_version_data(self, result, info): | |
""" | |
Update a result dictionary (the final result from _get_project) with a | |
dictionary for a specific version, which typically holds information | |
gleaned from a filename or URL for an archive for the distribution. | |
""" | |
name = info.pop('name') | |
version = info.pop('version') | |
if version in result: | |
dist = result[version] | |
md = dist.metadata | |
else: | |
dist = make_dist(name, version, scheme=self.scheme) | |
md = dist.metadata | |
dist.digest = digest = self._get_digest(info) | |
url = info['url'] | |
result['digests'][url] = digest | |
if md.source_url != info['url']: | |
md.source_url = self.prefer_url(md.source_url, url) | |
result['urls'].setdefault(version, set()).add(url) | |
dist.locator = self | |
result[version] = dist | |
def locate(self, requirement, prereleases=False): | |
""" | |
Find the most recent distribution which matches the given | |
requirement. | |
:param requirement: A requirement of the form 'foo (1.0)' or perhaps | |
'foo (>= 1.0, < 2.0, != 1.3)' | |
:param prereleases: If ``True``, allow pre-release versions | |
to be located. Otherwise, pre-release versions | |
are not returned. | |
:return: A :class:`Distribution` instance, or ``None`` if no such | |
distribution could be located. | |
""" | |
result = None | |
r = parse_requirement(requirement) | |
if r is None: # pragma: no cover | |
raise DistlibException('Not a valid requirement: %r' % requirement) | |
scheme = get_scheme(self.scheme) | |
self.matcher = matcher = scheme.matcher(r.requirement) | |
logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__) | |
versions = self.get_project(r.name) | |
if len(versions) > 2: # urls and digests keys are present | |
# sometimes, versions are invalid | |
slist = [] | |
vcls = matcher.version_class | |
for k in versions: | |
if k in ('urls', 'digests'): | |
continue | |
try: | |
if not matcher.match(k): | |
pass # logger.debug('%s did not match %r', matcher, k) | |
else: | |
if prereleases or not vcls(k).is_prerelease: | |
slist.append(k) | |
# else: | |
# logger.debug('skipping pre-release ' | |
# 'version %s of %s', k, matcher.name) | |
except Exception: # pragma: no cover | |
logger.warning('error matching %s with %r', matcher, k) | |
pass # slist.append(k) | |
if len(slist) > 1: | |
slist = sorted(slist, key=scheme.key) | |
if slist: | |
logger.debug('sorted list: %s', slist) | |
version = slist[-1] | |
result = versions[version] | |
if result: | |
if r.extras: | |
result.extras = r.extras | |
result.download_urls = versions.get('urls', {}).get(version, set()) | |
d = {} | |
sd = versions.get('digests', {}) | |
for url in result.download_urls: | |
if url in sd: # pragma: no cover | |
d[url] = sd[url] | |
result.digests = d | |
self.matcher = None | |
return result | |
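# A sketch of the URL-scoring machinery above. The URLs are invented for
# illustration; score_url(), prefer_url() and convert_url_to_download_info()
# all work on a bare Locator instance, even though _get_project() is abstract.
def _example_locator_scoring():  # pragma: no cover - illustrative sketch
    loc = Locator()
    sdist_https = 'https://pypi.org/packages/source/f/foo/foo-1.0.tar.gz'
    sdist_http = 'http://mirror.example.com/dists/foo-1.0.zip'
    # score_url() returns a tuple, so comparison is element-wise: https and
    # pypi.org hosting sort ahead of plain http on another host.
    assert loc.score_url(sdist_https) > loc.score_url(sdist_http)
    assert loc.prefer_url(sdist_https, sdist_http) == sdist_https
    # A compatible wheel URL is converted to a download-info dictionary with
    # "name", "version", "filename" and "url" keys.
    info = loc.convert_url_to_download_info(
        'https://files.example.com/foo-1.0-py2.py3-none-any.whl', 'foo')
    print(info['name'], info['version'], info['url'])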
class PyPIRPCLocator(Locator): | |
""" | |
This locator uses XML-RPC to locate distributions. It therefore | |
cannot be used with simple mirrors (that only mirror file content). | |
""" | |
def __init__(self, url, **kwargs): | |
""" | |
Initialise an instance. | |
:param url: The URL to use for XML-RPC. | |
:param kwargs: Passed to the superclass constructor. | |
""" | |
super(PyPIRPCLocator, self).__init__(**kwargs) | |
self.base_url = url | |
self.client = ServerProxy(url, timeout=3.0) | |
def get_distribution_names(self): | |
""" | |
Return all the distribution names known to this locator. | |
""" | |
return set(self.client.list_packages()) | |
def _get_project(self, name): | |
result = {'urls': {}, 'digests': {}} | |
versions = self.client.package_releases(name, True) | |
for v in versions: | |
urls = self.client.release_urls(name, v) | |
data = self.client.release_data(name, v) | |
metadata = Metadata(scheme=self.scheme) | |
metadata.name = data['name'] | |
metadata.version = data['version'] | |
metadata.license = data.get('license') | |
metadata.keywords = data.get('keywords', []) | |
metadata.summary = data.get('summary') | |
dist = Distribution(metadata) | |
if urls: | |
info = urls[0] | |
metadata.source_url = info['url'] | |
dist.digest = self._get_digest(info) | |
dist.locator = self | |
result[v] = dist | |
for info in urls: | |
url = info['url'] | |
digest = self._get_digest(info) | |
result['urls'].setdefault(v, set()).add(url) | |
result['digests'][url] = digest | |
return result | |
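# Usage sketch for the XML-RPC locator, assuming the index still serves the
# legacy XML-RPC methods used above (package_releases, release_urls,
# release_data); public PyPI has been restricting them, so treat this as
# illustrative only. The project name is arbitrary.
def _example_rpc_locator():  # pragma: no cover - illustrative sketch
    locator = PyPIRPCLocator('https://pypi.org/pypi')
    versions = locator.get_project('sarge')
    for v in sorted(k for k in versions if k not in ('urls', 'digests')):
        print(v, sorted(versions['urls'].get(v, set())))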
class PyPIJSONLocator(Locator): | |
""" | |
This locator uses PyPI's JSON interface. It's very limited in functionality | |
and probably not worth using. | |
""" | |
def __init__(self, url, **kwargs): | |
super(PyPIJSONLocator, self).__init__(**kwargs) | |
self.base_url = ensure_slash(url) | |
def get_distribution_names(self): | |
""" | |
Return all the distribution names known to this locator. | |
""" | |
raise NotImplementedError('Not available from this locator') | |
def _get_project(self, name): | |
result = {'urls': {}, 'digests': {}} | |
url = urljoin(self.base_url, '%s/json' % quote(name)) | |
try: | |
resp = self.opener.open(url) | |
data = resp.read().decode() # for now | |
d = json.loads(data) | |
md = Metadata(scheme=self.scheme) | |
data = d['info'] | |
md.name = data['name'] | |
md.version = data['version'] | |
md.license = data.get('license') | |
md.keywords = data.get('keywords', []) | |
md.summary = data.get('summary') | |
dist = Distribution(md) | |
dist.locator = self | |
urls = d['urls'] | |
result[md.version] = dist | |
for info in d['urls']: | |
url = info['url'] | |
dist.download_urls.add(url) | |
dist.digests[url] = self._get_digest(info) | |
result['urls'].setdefault(md.version, set()).add(url) | |
result['digests'][url] = self._get_digest(info) | |
# Now get other releases | |
for version, infos in d['releases'].items(): | |
if version == md.version: | |
continue # already done | |
omd = Metadata(scheme=self.scheme) | |
omd.name = md.name | |
omd.version = version | |
odist = Distribution(omd) | |
odist.locator = self | |
result[version] = odist | |
for info in infos: | |
url = info['url'] | |
odist.download_urls.add(url) | |
odist.digests[url] = self._get_digest(info) | |
result['urls'].setdefault(version, set()).add(url) | |
result['digests'][url] = self._get_digest(info) | |
# for info in urls: | |
# md.source_url = info['url'] | |
# dist.digest = self._get_digest(info) | |
# dist.locator = self | |
# for info in urls: | |
# url = info['url'] | |
# result['urls'].setdefault(md.version, set()).add(url) | |
# result['digests'][url] = self._get_digest(info) | |
except Exception as e: | |
self.errors.put(text_type(e)) | |
logger.exception('JSON fetch failed: %s', e) | |
return result | |
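# Usage sketch for the JSON locator; the base URL is the standard per-project
# JSON root, so get_project('requests') fetches
# https://pypi.org/pypi/requests/json. Network access is assumed and the
# project name is arbitrary.
def _example_json_locator():  # pragma: no cover - illustrative sketch
    locator = PyPIJSONLocator('https://pypi.org/pypi/')
    dist = locator.locate('requests (>= 2.0)')
    if dist is not None:
        print(dist.name_and_version)
        print(sorted(dist.download_urls)[:3])
    else:
        print('nothing located; errors: %s' % locator.get_errors())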
class Page(object): | |
""" | |
This class represents a scraped HTML page. | |
""" | |
# The following slightly hairy-looking regex just looks for the contents of | |
# an anchor link, which has an attribute "href" either immediately preceded | |
# or immediately followed by a "rel" attribute. The attribute values can be | |
# declared with double quotes, single quotes or no quotes - which leads to | |
# the length of the expression. | |
_href = re.compile(""" | |
(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)? | |
href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*)) | |
(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))? | |
""", re.I | re.S | re.X) | |
_base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S) | |
def __init__(self, data, url): | |
""" | |
Initialise an instance with the Unicode page contents and the URL they | |
came from. | |
""" | |
self.data = data | |
self.base_url = self.url = url | |
m = self._base.search(self.data) | |
if m: | |
self.base_url = m.group(1) | |
_clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I) | |
@cached_property
def links(self):
""" | |
Return the URLs of all the links on a page together with information | |
about their "rel" attribute, for determining which ones to treat as | |
downloads and which ones to queue for further scraping. | |
""" | |
def clean(url): | |
"Tidy up an URL." | |
scheme, netloc, path, params, query, frag = urlparse(url) | |
return urlunparse((scheme, netloc, quote(path), | |
params, query, frag)) | |
result = set() | |
for match in self._href.finditer(self.data): | |
d = match.groupdict('') | |
rel = (d['rel1'] or d['rel2'] or d['rel3'] or | |
d['rel4'] or d['rel5'] or d['rel6']) | |
url = d['url1'] or d['url2'] or d['url3'] | |
url = urljoin(self.base_url, url) | |
url = unescape(url) | |
url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url) | |
result.add((url, rel)) | |
# We sort the result, hoping to bring the most recent versions | |
# to the front | |
result = sorted(result, key=lambda t: t[0], reverse=True) | |
return result | |
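# A small sketch of link extraction; the HTML fragment and URLs are invented.
# Relative hrefs are resolved against the page URL (or a <base href> if one
# is present), and each link is paired with its "rel" attribute.
def _example_page_links():  # pragma: no cover - illustrative sketch
    html = ('<a href="foo-1.0.tar.gz" rel="download">foo-1.0.tar.gz</a> '
            '<a rel="homepage" href="https://example.com/foo">home</a>')
    page = Page(html, 'https://pypi.org/simple/foo/')
    for url, rel in page.links:
        print(rel, url)  # e.g. download https://pypi.org/simple/foo/foo-1.0.tar.gz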
class SimpleScrapingLocator(Locator): | |
""" | |
A locator which scrapes HTML pages to locate downloads for a distribution. | |
This runs multiple threads to do the I/O; performance is at least as good | |
as pip's PackageFinder, which works in an analogous fashion. | |
""" | |
# These are used to deal with various Content-Encoding schemes. | |
decoders = { | |
'deflate': zlib.decompress, | |
'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(b)).read(), | |
'none': lambda b: b, | |
} | |
def __init__(self, url, timeout=None, num_workers=10, **kwargs): | |
""" | |
Initialise an instance. | |
:param url: The root URL to use for scraping. | |
:param timeout: The timeout, in seconds, to be applied to requests. | |
This defaults to ``None`` (no timeout specified). | |
:param num_workers: The number of worker threads you want to do I/O.
This defaults to 10.
:param kwargs: Passed to the superclass. | |
""" | |
super(SimpleScrapingLocator, self).__init__(**kwargs) | |
self.base_url = ensure_slash(url) | |
self.timeout = timeout | |
self._page_cache = {} | |
self._seen = set() | |
self._to_fetch = queue.Queue() | |
self._bad_hosts = set() | |
self.skip_externals = False | |
self.num_workers = num_workers | |
self._lock = threading.RLock() | |
# See issue #45: we need to be resilient when the locator is used | |
# in a thread, e.g. with concurrent.futures. We can't use self._lock | |
# as it is for coordinating our internal threads - the ones created | |
# in _prepare_threads. | |
self._gplock = threading.RLock() | |
self.platform_check = False # See issue #112 | |
def _prepare_threads(self): | |
""" | |
Threads are created only when get_project is called, and terminate | |
before it returns. They are there primarily to parallelise I/O (i.e. | |
fetching web pages). | |
""" | |
self._threads = [] | |
for i in range(self.num_workers): | |
t = threading.Thread(target=self._fetch) | |
t.daemon = True | |
t.start() | |
self._threads.append(t) | |
def _wait_threads(self): | |
""" | |
Tell all the threads to terminate (by sending a sentinel value) and | |
wait for them to do so. | |
""" | |
# Note that you need two loops, since you can't say which | |
# thread will get each sentinel | |
for t in self._threads: | |
self._to_fetch.put(None) # sentinel | |
for t in self._threads: | |
t.join() | |
self._threads = [] | |
def _get_project(self, name): | |
result = {'urls': {}, 'digests': {}} | |
with self._gplock: | |
self.result = result | |
self.project_name = name | |
url = urljoin(self.base_url, '%s/' % quote(name)) | |
self._seen.clear() | |
self._page_cache.clear() | |
self._prepare_threads() | |
try: | |
logger.debug('Queueing %s', url) | |
self._to_fetch.put(url) | |
self._to_fetch.join() | |
finally: | |
self._wait_threads() | |
del self.result | |
return result | |
platform_dependent = re.compile(r'\b(linux_(i\d86|x86_64|arm\w+)|' | |
r'win(32|_amd64)|macosx_?\d+)\b', re.I) | |
def _is_platform_dependent(self, url): | |
""" | |
Does an URL refer to a platform-specific download? | |
""" | |
return self.platform_dependent.search(url) | |
def _process_download(self, url): | |
""" | |
See if an URL is a suitable download for a project. | |
If it is, register information in the result dictionary (for | |
_get_project) about the specific version it's for. | |
Note that the return value isn't actually used other than as a boolean | |
value. | |
""" | |
if self.platform_check and self._is_platform_dependent(url): | |
info = None | |
else: | |
info = self.convert_url_to_download_info(url, self.project_name) | |
logger.debug('process_download: %s -> %s', url, info) | |
if info: | |
with self._lock: # needed because self.result is shared | |
self._update_version_data(self.result, info) | |
return info | |
def _should_queue(self, link, referrer, rel): | |
""" | |
Determine whether a link URL from a referring page and with a | |
particular "rel" attribute should be queued for scraping. | |
""" | |
scheme, netloc, path, _, _, _ = urlparse(link) | |
if path.endswith(self.source_extensions + self.binary_extensions + | |
self.excluded_extensions): | |
result = False | |
elif self.skip_externals and not link.startswith(self.base_url): | |
result = False | |
elif not referrer.startswith(self.base_url): | |
result = False | |
elif rel not in ('homepage', 'download'): | |
result = False | |
elif scheme not in ('http', 'https', 'ftp'): | |
result = False | |
elif self._is_platform_dependent(link): | |
result = False | |
else: | |
host = netloc.split(':', 1)[0] | |
if host.lower() == 'localhost': | |
result = False | |
else: | |
result = True | |
logger.debug('should_queue: %s (%s) from %s -> %s', link, rel, | |
referrer, result) | |
return result | |
def _fetch(self): | |
""" | |
Get a URL to fetch from the work queue, get the HTML page, examine its | |
links for download candidates and candidates for further scraping. | |
This is a handy method to run in a thread. | |
""" | |
while True: | |
url = self._to_fetch.get() | |
try: | |
if url: | |
page = self.get_page(url) | |
if page is None: # e.g. after an error | |
continue | |
for link, rel in page.links: | |
if link not in self._seen: | |
try: | |
self._seen.add(link) | |
if (not self._process_download(link) and | |
self._should_queue(link, url, rel)): | |
logger.debug('Queueing %s from %s', link, url) | |
self._to_fetch.put(link) | |
except MetadataInvalidError: # e.g. invalid versions | |
pass | |
except Exception as e: # pragma: no cover | |
self.errors.put(text_type(e)) | |
finally: | |
# always do this, to avoid hangs :-) | |
self._to_fetch.task_done() | |
if not url: | |
#logger.debug('Sentinel seen, quitting.') | |
break | |
def get_page(self, url): | |
""" | |
Get the HTML for an URL, possibly from an in-memory cache. | |
XXX TODO Note: this cache is never actually cleared. It's assumed that | |
the data won't get stale over the lifetime of a locator instance (not | |
necessarily true for the default_locator). | |
""" | |
# http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api | |
scheme, netloc, path, _, _, _ = urlparse(url) | |
if scheme == 'file' and os.path.isdir(url2pathname(path)): | |
url = urljoin(ensure_slash(url), 'index.html') | |
if url in self._page_cache: | |
result = self._page_cache[url] | |
logger.debug('Returning %s from cache: %s', url, result) | |
else: | |
host = netloc.split(':', 1)[0] | |
result = None | |
if host in self._bad_hosts: | |
logger.debug('Skipping %s due to bad host %s', url, host) | |
else: | |
req = Request(url, headers={'Accept-encoding': 'identity'}) | |
try: | |
logger.debug('Fetching %s', url) | |
resp = self.opener.open(req, timeout=self.timeout) | |
logger.debug('Fetched %s', url) | |
headers = resp.info() | |
content_type = headers.get('Content-Type', '') | |
if HTML_CONTENT_TYPE.match(content_type): | |
final_url = resp.geturl() | |
data = resp.read() | |
encoding = headers.get('Content-Encoding') | |
if encoding: | |
decoder = self.decoders[encoding] # fail if not found | |
data = decoder(data) | |
encoding = 'utf-8' | |
m = CHARSET.search(content_type) | |
if m: | |
encoding = m.group(1) | |
try: | |
data = data.decode(encoding) | |
except UnicodeError: # pragma: no cover | |
data = data.decode('latin-1') # fallback | |
result = Page(data, final_url) | |
self._page_cache[final_url] = result | |
except HTTPError as e: | |
if e.code != 404: | |
logger.exception('Fetch failed: %s: %s', url, e) | |
except URLError as e: # pragma: no cover | |
logger.exception('Fetch failed: %s: %s', url, e) | |
with self._lock: | |
self._bad_hosts.add(host) | |
except Exception as e: # pragma: no cover | |
logger.exception('Fetch failed: %s: %s', url, e) | |
finally: | |
self._page_cache[url] = result # even if None (failure) | |
return result | |
_distname_re = re.compile('<a href=[^>]*>([^<]+)<') | |
def get_distribution_names(self): | |
""" | |
Return all the distribution names known to this locator. | |
""" | |
result = set() | |
page = self.get_page(self.base_url) | |
if not page: | |
raise DistlibException('Unable to get %s' % self.base_url) | |
for match in self._distname_re.finditer(page.data): | |
result.add(match.group(1)) | |
return result | |
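# Usage sketch for the scraping locator against a "simple" style index; this
# needs network access and spins up worker threads, so it can take a few
# seconds. The requirement string is arbitrary.
def _example_scraping_locator():  # pragma: no cover - illustrative sketch
    locator = SimpleScrapingLocator('https://pypi.org/simple/', timeout=5.0,
                                    num_workers=4)
    dist = locator.locate('pip (>= 20.0)')
    if dist is not None:
        print(dist.name_and_version)
        print(sorted(dist.download_urls))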
class DirectoryLocator(Locator): | |
""" | |
This class locates distributions in a directory tree. | |
""" | |
def __init__(self, path, **kwargs): | |
""" | |
Initialise an instance. | |
:param path: The root of the directory tree to search. | |
:param kwargs: Passed to the superclass constructor, | |
except for: | |
* recursive - if True (the default), subdirectories are | |
recursed into. If False, only the top-level directory | |
is searched.
""" | |
self.recursive = kwargs.pop('recursive', True) | |
super(DirectoryLocator, self).__init__(**kwargs) | |
path = os.path.abspath(path) | |
if not os.path.isdir(path): # pragma: no cover | |
raise DistlibException('Not a directory: %r' % path) | |
self.base_dir = path | |
def should_include(self, filename, parent): | |
""" | |
Should a filename be considered as a candidate for a distribution | |
archive? As well as the filename, the directory which contains it | |
is provided, though not used by the current implementation. | |
""" | |
return filename.endswith(self.downloadable_extensions) | |
def _get_project(self, name): | |
result = {'urls': {}, 'digests': {}} | |
for root, dirs, files in os.walk(self.base_dir): | |
for fn in files: | |
if self.should_include(fn, root): | |
fn = os.path.join(root, fn) | |
url = urlunparse(('file', '', | |
pathname2url(os.path.abspath(fn)), | |
'', '', '')) | |
info = self.convert_url_to_download_info(url, name) | |
if info: | |
self._update_version_data(result, info) | |
if not self.recursive: | |
break | |
return result | |
def get_distribution_names(self): | |
""" | |
Return all the distribution names known to this locator. | |
""" | |
result = set() | |
for root, dirs, files in os.walk(self.base_dir): | |
for fn in files: | |
if self.should_include(fn, root): | |
fn = os.path.join(root, fn) | |
url = urlunparse(('file', '', | |
pathname2url(os.path.abspath(fn)), | |
'', '', '')) | |
info = self.convert_url_to_download_info(url, None) | |
if info: | |
result.add(info['name']) | |
if not self.recursive: | |
break | |
return result | |
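# Usage sketch for the directory locator; the path below is an assumption -
# point it at any directory holding sdists or wheels (for example a local
# release folder or a pip download cache).
def _example_directory_locator():  # pragma: no cover - illustrative sketch
    locator = DirectoryLocator('/path/to/local/archives', recursive=True)
    print(sorted(locator.get_distribution_names()))
    print(locator.locate('foo (>= 1.0)'))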
class JSONLocator(Locator): | |
""" | |
This locator uses special extended metadata (not available on PyPI) and is | |
the basis of performant dependency resolution in distlib. Other locators | |
require archive downloads before dependencies can be determined! As you | |
might imagine, that can be slow. | |
""" | |
def get_distribution_names(self): | |
""" | |
Return all the distribution names known to this locator. | |
""" | |
raise NotImplementedError('Not available from this locator') | |
def _get_project(self, name): | |
result = {'urls': {}, 'digests': {}} | |
data = get_project_data(name) | |
if data: | |
for info in data.get('files', []): | |
if info['ptype'] != 'sdist' or info['pyversion'] != 'source': | |
continue | |
# We don't store summary in project metadata as it makes | |
# the data bigger for no benefit during dependency | |
# resolution | |
dist = make_dist(data['name'], info['version'], | |
summary=data.get('summary', | |
'Placeholder for summary'), | |
scheme=self.scheme) | |
md = dist.metadata | |
md.source_url = info['url'] | |
# TODO SHA256 digest | |
if 'digest' in info and info['digest']: | |
dist.digest = ('md5', info['digest']) | |
md.dependencies = info.get('requirements', {}) | |
dist.exports = info.get('exports', {}) | |
result[dist.version] = dist | |
result['urls'].setdefault(dist.version, set()).add(info['url']) | |
return result | |
class DistPathLocator(Locator): | |
""" | |
This locator finds installed distributions in a path. It can be useful for | |
adding to an :class:`AggregatingLocator`. | |
""" | |
def __init__(self, distpath, **kwargs): | |
""" | |
Initialise an instance. | |
:param distpath: A :class:`DistributionPath` instance to search. | |
""" | |
super(DistPathLocator, self).__init__(**kwargs) | |
assert isinstance(distpath, DistributionPath) | |
self.distpath = distpath | |
def _get_project(self, name): | |
dist = self.distpath.get_distribution(name) | |
if dist is None: | |
result = {'urls': {}, 'digests': {}} | |
else: | |
result = { | |
dist.version: dist, | |
'urls': {dist.version: set([dist.source_url])}, | |
'digests': {dist.version: set([None])} | |
} | |
return result | |
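# Usage sketch for the installed-distribution locator. DistributionPath() with
# no arguments searches sys.path, so this inspects the running environment;
# the project name is arbitrary.
def _example_distpath_locator():  # pragma: no cover - illustrative sketch
    locator = DistPathLocator(DistributionPath())
    versions = locator.get_project('setuptools')
    installed = [k for k in versions if k not in ('urls', 'digests')]
    print(installed or 'setuptools is not installed')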
class AggregatingLocator(Locator): | |
""" | |
This class allows you to chain and/or merge a list of locators. | |
""" | |
def __init__(self, *locators, **kwargs): | |
""" | |
Initialise an instance. | |
:param locators: The list of locators to search. | |
:param kwargs: Passed to the superclass constructor, | |
except for: | |
* merge - if False (the default), the first successful | |
search from any of the locators is returned. If True, | |
the results from all locators are merged (this can be | |
slow). | |
""" | |
self.merge = kwargs.pop('merge', False) | |
self.locators = locators | |
super(AggregatingLocator, self).__init__(**kwargs) | |
def clear_cache(self): | |
super(AggregatingLocator, self).clear_cache() | |
for locator in self.locators: | |
locator.clear_cache() | |
def _set_scheme(self, value): | |
self._scheme = value | |
for locator in self.locators: | |
locator.scheme = value | |
scheme = property(Locator.scheme.fget, _set_scheme) | |
def _get_project(self, name): | |
result = {} | |
for locator in self.locators: | |
d = locator.get_project(name) | |
if d: | |
if self.merge: | |
files = result.get('urls', {}) | |
digests = result.get('digests', {}) | |
# next line could overwrite result['urls'], result['digests'] | |
result.update(d) | |
df = result.get('urls') | |
if files and df: | |
for k, v in files.items(): | |
if k in df: | |
df[k] |= v | |
else: | |
df[k] = v | |
dd = result.get('digests') | |
if digests and dd: | |
dd.update(digests) | |
else: | |
# See issue #18. If any dists are found and we're looking | |
# for specific constraints, we only return something if | |
# a match is found. For example, if a DirectoryLocator | |
# returns just foo (1.0) while we're looking for | |
# foo (>= 2.0), we'll pretend there was nothing there so | |
# that subsequent locators can be queried. Otherwise we | |
# would just return foo (1.0) which would then lead to a | |
# failure to find foo (>= 2.0), because other locators | |
# weren't searched. Note that this only matters when | |
# merge=False. | |
if self.matcher is None: | |
found = True | |
else: | |
found = False | |
for k in d: | |
if self.matcher.match(k): | |
found = True | |
break | |
if found: | |
result = d | |
break | |
return result | |
def get_distribution_names(self): | |
""" | |
Return all the distribution names known to this locator. | |
""" | |
result = set() | |
for locator in self.locators: | |
try: | |
result |= locator.get_distribution_names() | |
except NotImplementedError: | |
pass | |
return result | |
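# Sketch of chaining locators: with merge=False (the default) the first locator
# that yields a match satisfying the requirement wins, so a local directory can
# shadow the index. The directory path is an assumption.
def _example_aggregating_locator():  # pragma: no cover - illustrative sketch
    locator = AggregatingLocator(
        DirectoryLocator('/path/to/local/archives'),
        SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0),
        scheme='legacy')
    print(locator.locate('foo (>= 1.0)'))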
# We use a legacy scheme simply because most of the dists on PyPI use legacy | |
# versions which don't conform to PEP 440. | |
default_locator = AggregatingLocator( | |
# JSONLocator(), # don't use as PEP 426 is withdrawn | |
SimpleScrapingLocator('https://pypi.org/simple/', | |
timeout=3.0), | |
scheme='legacy') | |
locate = default_locator.locate | |
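# The module-level convenience function in action; this goes through
# default_locator and therefore hits the network, so results depend on what
# the index currently serves. The requirement is arbitrary.
def _example_module_locate():  # pragma: no cover - illustrative sketch
    dist = locate('requests (>= 2.0, < 3.0)')
    if dist is not None:
        print(dist.name_and_version)
        print(dist.digests)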
class DependencyFinder(object): | |
""" | |
Locate dependencies for distributions. | |
""" | |
def __init__(self, locator=None): | |
""" | |
Initialise an instance, using the specified locator | |
to locate distributions. | |
""" | |
self.locator = locator or default_locator | |
self.scheme = get_scheme(self.locator.scheme) | |
def add_distribution(self, dist): | |
""" | |
Add a distribution to the finder. This will update internal information | |
about who provides what. | |
:param dist: The distribution to add. | |
""" | |
logger.debug('adding distribution %s', dist) | |
name = dist.key | |
self.dists_by_name[name] = dist | |
self.dists[(name, dist.version)] = dist | |
for p in dist.provides: | |
name, version = parse_name_and_version(p) | |
logger.debug('Add to provided: %s, %s, %s', name, version, dist) | |
self.provided.setdefault(name, set()).add((version, dist)) | |
def remove_distribution(self, dist): | |
""" | |
Remove a distribution from the finder. This will update internal | |
information about who provides what. | |
:param dist: The distribution to remove. | |
""" | |
logger.debug('removing distribution %s', dist) | |
name = dist.key | |
del self.dists_by_name[name] | |
del self.dists[(name, dist.version)] | |
for p in dist.provides: | |
name, version = parse_name_and_version(p) | |
logger.debug('Remove from provided: %s, %s, %s', name, version, dist) | |
s = self.provided[name] | |
s.remove((version, dist)) | |
if not s: | |
del self.provided[name] | |
def get_matcher(self, reqt): | |
""" | |
Get a version matcher for a requirement. | |
:param reqt: The requirement | |
:type reqt: str | |
:return: A version matcher (an instance of | |
:class:`distlib.version.Matcher`). | |
""" | |
try: | |
matcher = self.scheme.matcher(reqt) | |
except UnsupportedVersionError: # pragma: no cover | |
# XXX compat-mode if cannot read the version | |
name = reqt.split()[0] | |
matcher = self.scheme.matcher(name) | |
return matcher | |
def find_providers(self, reqt): | |
""" | |
Find the distributions which can fulfill a requirement. | |
:param reqt: The requirement. | |
:type reqt: str | |
:return: A set of distributions which can fulfill the requirement.
""" | |
matcher = self.get_matcher(reqt) | |
name = matcher.key # case-insensitive | |
result = set() | |
provided = self.provided | |
if name in provided: | |
for version, provider in provided[name]: | |
try: | |
match = matcher.match(version) | |
except UnsupportedVersionError: | |
match = False | |
if match: | |
result.add(provider) | |
break | |
return result | |
def try_to_replace(self, provider, other, problems): | |
""" | |
Attempt to replace one provider with another. This is typically used | |
when resolving dependencies from multiple sources, e.g. A requires | |
(B >= 1.0) while C requires (B >= 1.1). | |
For successful replacement, ``provider`` must meet all the requirements | |
which ``other`` fulfills. | |
:param provider: The provider we are trying to replace with. | |
:param other: The provider we're trying to replace. | |
:param problems: If False is returned, this will contain what | |
problems prevented replacement. This is currently | |
a tuple of the literal string 'cantreplace', | |
``provider``, ``other`` and the set of requirements | |
that ``provider`` couldn't fulfill. | |
:return: True if we can replace ``other`` with ``provider``, else | |
False. | |
""" | |
rlist = self.reqts[other] | |
unmatched = set() | |
for s in rlist: | |
matcher = self.get_matcher(s) | |
if not matcher.match(provider.version): | |
unmatched.add(s) | |
if unmatched: | |
# can't replace other with provider | |
problems.add(('cantreplace', provider, other, | |
frozenset(unmatched))) | |
result = False | |
else: | |
# can replace other with provider | |
self.remove_distribution(other) | |
del self.reqts[other] | |
for s in rlist: | |
self.reqts.setdefault(provider, set()).add(s) | |
self.add_distribution(provider) | |
result = True | |
return result | |
def find(self, requirement, meta_extras=None, prereleases=False): | |
""" | |
Find a distribution and all distributions it depends on. | |
:param requirement: The requirement specifying the distribution to | |
find, or a Distribution instance. | |
:param meta_extras: A list of meta extras such as :test:, :build: and | |
so on. | |
:param prereleases: If ``True``, allow pre-release versions to be | |
returned - otherwise, don't return prereleases | |
unless they're all that's available. | |
Return a set of :class:`Distribution` instances and a set of | |
problems. | |
The distributions returned should be such that they have the | |
:attr:`required` attribute set to ``True`` if they were | |
from the ``requirement`` passed to ``find()``, and they have the | |
:attr:`build_time_dependency` attribute set to ``True`` unless they | |
are post-installation dependencies of the ``requirement``. | |
The problems should be a tuple consisting of the string | |
``'unsatisfied'`` and the requirement which couldn't be satisfied | |
by any distribution known to the locator. | |
""" | |
self.provided = {} | |
self.dists = {} | |
self.dists_by_name = {} | |
self.reqts = {} | |
meta_extras = set(meta_extras or []) | |
if ':*:' in meta_extras: | |
meta_extras.remove(':*:') | |
# :meta: and :run: are implicitly included | |
meta_extras |= set([':test:', ':build:', ':dev:']) | |
if isinstance(requirement, Distribution): | |
dist = odist = requirement | |
logger.debug('passed %s as requirement', odist) | |
else: | |
dist = odist = self.locator.locate(requirement, | |
prereleases=prereleases) | |
if dist is None: | |
raise DistlibException('Unable to locate %r' % requirement) | |
logger.debug('located %s', odist) | |
dist.requested = True | |
problems = set() | |
todo = set([dist]) | |
install_dists = set([odist]) | |
while todo: | |
dist = todo.pop() | |
name = dist.key # case-insensitive | |
if name not in self.dists_by_name: | |
self.add_distribution(dist) | |
else: | |
#import pdb; pdb.set_trace() | |
other = self.dists_by_name[name] | |
if other != dist: | |
self.try_to_replace(dist, other, problems) | |
ireqts = dist.run_requires | dist.meta_requires | |
sreqts = dist.build_requires | |
ereqts = set() | |
if meta_extras and dist in install_dists: | |
for key in ('test', 'build', 'dev'): | |
e = ':%s:' % key | |
if e in meta_extras: | |
ereqts |= getattr(dist, '%s_requires' % key) | |
all_reqts = ireqts | sreqts | ereqts | |
for r in all_reqts: | |
providers = self.find_providers(r) | |
if not providers: | |
logger.debug('No providers found for %r', r) | |
provider = self.locator.locate(r, prereleases=prereleases) | |
# If no provider is found and we didn't consider | |
# prereleases, consider them now. | |
if provider is None and not prereleases: | |
provider = self.locator.locate(r, prereleases=True) | |
if provider is None: | |
logger.debug('Cannot satisfy %r', r) | |
problems.add(('unsatisfied', r)) | |
else: | |
n, v = provider.key, provider.version | |
if (n, v) not in self.dists: | |
todo.add(provider) | |
providers.add(provider) | |
if r in ireqts and dist in install_dists: | |
install_dists.add(provider) | |
logger.debug('Adding %s to install_dists', | |
provider.name_and_version) | |
for p in providers: | |
name = p.key | |
if name not in self.dists_by_name: | |
self.reqts.setdefault(p, set()).add(r) | |
else: | |
other = self.dists_by_name[name] | |
if other != p: | |
# see if other can be replaced by p | |
self.try_to_replace(p, other, problems) | |
dists = set(self.dists.values()) | |
for dist in dists: | |
dist.build_time_dependency = dist not in install_dists | |
if dist.build_time_dependency: | |
logger.debug('%s is a build-time dependency only.', | |
dist.name_and_version) | |
logger.debug('find done for %s', odist) | |
return dists, problems | |
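# End-to-end sketch of dependency resolution using the default locator. Network
# access is assumed and the exact distribution set depends on the index; the
# requirement string is arbitrary.
def _example_dependency_finder():  # pragma: no cover - illustrative sketch
    finder = DependencyFinder(default_locator)
    dists, problems = finder.find('requests (>= 2.0)', prereleases=False)
    for dist in sorted(dists, key=lambda d: d.name_and_version):
        kind = 'build-time' if dist.build_time_dependency else 'run-time'
        print('%-40s %s' % (dist.name_and_version, kind))
    for problem in problems:
        print('problem:', problem)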