"""Find ip addresses in text.

# Code forked from
# https://github.com/bigcode-project/pii-lib/blob/main/utils/emails_ip_addresses_detection.py
# under the Apache 2.0 License.
"""
import ipaddress
from typing import Iterator

import regex

from ..data.dataset_utils import lilac_span
from ..schema import Item

# Four dot-separated decimal octets, each in the range 0-255.
ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'  # noqa: E501
# Alternation over the textual IPv6 forms (full, `::`-compressed, link-local with zone id,
# and IPv4-embedded).
ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'  # noqa: E501

# Group 1 is the candidate address. It must be preceded by start-of-text or a delimiter
# character and followed by end-of-text or a delimiter character.
# NOTE(review): the trailing delimiter is consumed by the match (it is not a lookahead), so
# when two addresses are separated by a single delimiter character only the first is found.
ip_pattern = regex.compile(
    (r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(' + r'|'.join([ipv4_pattern, ipv6_pattern]) +
     ')(?:$|[\\s@,?!;:\'\"(.\\p{Han}])'))

# Date-like shapes used to screen out false positives. Each pattern uses the same
# leading/trailing delimiter classes as `ip_pattern`.
year_patterns = [
    regex.compile(
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])"
    ),  # yyyy-yyyy or yyyy/yyyy
    regex.compile(
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"
    ),  # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm
    regex.compile(
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"
    ),  # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy
    regex.compile(
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"
    ),  # mm-yyyy or mm/yyyy or the same but with yy
    regex.compile(
        r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"
    ),  # yyyy-mm (this pattern only accepts a literal '-' separator)
]

# Number of characters of surrounding text, on each side of a match, that is inspected
# for the words "dns" / "server".
_CONTEXT_WINDOW = 100


def _ip_has_digit(matched_str: str) -> bool:
    """Return True if the span contains at least one decimal digit.

    Used to reject spans such as a bare `::`, which the IPv6 pattern accepts.
    """
    return any(map(str.isdigit, matched_str))


def _matches_date_pattern(matched_str: str) -> bool:
    """Return True if the start of `matched_str` matches any pattern in `year_patterns`.

    NOTE(review): this uses `match`, not `fullmatch`, and '.' is part of the trailing
    delimiter class, so a dotted quad whose first three parts look like a date
    (e.g. `10.10.10.1`) is reported as a date. Confirm whether that is intended.
    """
    return any(year_regex.match(matched_str) for year_regex in year_patterns)


def _filter_versions(matched_str: str, context: str) -> bool:
    """Return True if the span looks like a version number rather than an IP address.

    A span is treated as a version number when it has exactly three dots and is seven
    characters long (the `x.x.x.x` shape), unless the word "dns" or "server" appears
    (case-insensitively) in `context`.

    Args:
      matched_str: The candidate address text.
      context: Text surrounding the candidate, searched for "dns" / "server".
    """
    looks_like_version = matched_str.count('.') == 3 and len(matched_str) == 7
    if not looks_like_version:
        return False
    lowered_context = context.lower()
    # Mentions of dns/server suggest a real address such as 8.8.8.8, so keep it.
    return not ('dns' in lowered_context or 'server' in lowered_context)


def _not_ip_address(matched_str: str) -> bool:
    """Return True if `ipaddress.ip_address` rejects the string (e.g. `33.01.33.33`)."""
    try:
        ipaddress.ip_address(matched_str)
    except ValueError:
        return True
    return False


def find_ip_addresses(text: str) -> Iterator[Item]:
    """Find IP addresses in the text.

    Args:
      text: The text to scan.

    Yields:
      One `lilac_span(start, end)` per accepted address, where `start`/`end` are the
      character offsets of the address itself (delimiters excluded).
    """
    for match in ip_pattern.finditer(text):
        # `ip_pattern` always defines capture groups, so this guard is purely defensive.
        if not match.groups():
            continue

        value = match.group(1)
        start, end = match.span(1)

        # Filter out false positive IPs.
        if not _ip_has_digit(value):
            continue
        if _matches_date_pattern(value):
            continue

        # Clamp the lower bound: a negative slice start would index from the END of `text`,
        # producing an empty or unrelated context for matches near the beginning.
        context = text[max(0, start - _CONTEXT_WINDOW):end + _CONTEXT_WINDOW]
        if _filter_versions(value, context) or _not_ip_address(value):
            continue

        yield lilac_span(start, end)