import re import sys import locale import io def re_findall(pattern, string): return [m.groupdict() for m in re.finditer(pattern, string)] def jaccard(x1, x2, y1, y2): # Calculate jaccard index intersection = max(0, min(x2, y2)-max(x1, y1)) filled_union = max(x2, y2) - min(x1, y1) return intersection/filled_union if filled_union > 0 else 0 def regex_search(text, pattern, group=1, default=None): match = re.search(pattern, text) return match.group(group) if match else default def _windows_write_string(s, out, skip_errors=True): """ Returns True if the string was written using special methods, False if it has yet to be written out.""" # Adapted from http://stackoverflow.com/a/3259271/35070 import ctypes import ctypes.wintypes WIN_OUTPUT_IDS = { 1: -11, 2: -12, } try: fileno = out.fileno() except AttributeError: # If the output stream doesn't have a fileno, it's virtual return False except io.UnsupportedOperation: # Some strange Windows pseudo files? return False if fileno not in WIN_OUTPUT_IDS: return False GetStdHandle = ctypes.WINFUNCTYPE( ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( ('GetStdHandle', ctypes.windll.kernel32)) h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) WriteConsoleW = ctypes.WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) written = ctypes.wintypes.DWORD(0) GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)( ('GetFileType', ctypes.windll.kernel32)) FILE_TYPE_CHAR = 0x0002 FILE_TYPE_REMOTE = 0x8000 GetConsoleMode = ctypes.WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.POINTER(ctypes.wintypes.DWORD))( ('GetConsoleMode', ctypes.windll.kernel32)) INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) if not_a_console(h): return False def next_nonbmp_pos(s): try: return next(i for i, c in enumerate(s) if ord(c) > 0xffff) except StopIteration: return len(s) while s: count = min(next_nonbmp_pos(s), 1024) ret = WriteConsoleW( h, s, count if count else 2, ctypes.byref(written), None) if ret == 0: if skip_errors: continue else: raise OSError('Failed to write string') if not count: # We just wrote a non-BMP character assert written.value == 2 s = s[1:] else: assert written.value > 0 s = s[written.value:] return True def preferredencoding(): """Get preferred encoding. Returns the best encoding scheme for the system, based on locale.getpreferredencoding() and some further tweaks. """ try: pref = locale.getpreferredencoding() 'TEST'.encode(pref) except Exception: pref = 'utf-8' return pref def safe_print(*objects, sep=' ', end='\n', out=None, encoding=None, flush=False): """ Ensure printing to standard output can be done safely (especially on Windows). There are usually issues with printing emojis and non utf-8 characters. """ output_string = sep.join(map(lambda x: str(x), objects)) + end if out is None: out = sys.stdout if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'): if _windows_write_string(output_string, out): return if 'b' in getattr(out, 'mode', '') or not hasattr(out, 'buffer'): out.write(output_string) else: enc = encoding or getattr(out, 'encoding', None) or preferredencoding() byt = output_string.encode(enc, 'ignore') out.buffer.write(byt) if flush and hasattr(out, 'flush'): out.flush()