import re

# Compiled pattern that finds URLs embedded in free text.
#
# The pattern is assembled from adjacent raw-string fragments so that each
# piece can carry its own comment. It is compiled case-insensitively and in
# Unicode mode. The only capturing group is the literal "localhost" host.
#
# NOTE(review): the fragments from the left-boundary lookbehind through the
# private-network exclusions were missing from the damaged source text and
# were reconstructed from the well-known Diego Perini URL pattern that the
# remaining fragments follow -- confirm against the upstream file.
#
# NOTE(review): inside a raw string, ``\\u00a1-\\uffff`` is read by ``re`` as
# a literal backslash followed by the range ``1``..``\`` (digits, ``:;<=>?@``,
# ``A-Z``, ``[``), not as the code-point range U+00A1..U+FFFF. It is kept
# unchanged here because correcting it would alter what existing callers
# match (e.g. a query string directly after the TLD); decide separately
# whether ``\u00a1-\uffff`` was intended.
URL_REGEX = re.compile(
    # left boundary: start of string, or not preceded by a word char, "/" or "."
    r"(?:^|(?<![\w\/\.]))"
    # protocol identifier (or a bare "www." / "www2." style prefix)
    r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
    # optional user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion: private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))"
    r"|"
    # literal "localhost" (captured as group 1)
    r"(?:(localhost))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path: runs until whitespace or a closing ")", "]" or "}"
    r"(?:\/[^\)\]\}\s]*)?",
    # The pattern used to end with a right-boundary check:
    #     r"(?:$|(?![\w?!+&\/\)]))"
    # @jfilter removed it because its purpose was unclear (it may have been
    # useful); the resource path above instead refuses ")", "]" and "}" so
    # that closing brackets are never swallowed into the URL.
    flags=re.UNICODE | re.IGNORECASE,
)