Spaces:
Sleeping
Sleeping
| """ | |
| _warn_once.py — De-duplicate repeated warnings for the same file. | |
| parser.py, graph_builder.py, and resolver.py each independently call | |
| ast.parse() on every file. When a file has a syntax error, all three would | |
| otherwise log their own identical warning. This module tracks which files | |
| have already been warned about (per process run) so only the first one | |
| actually prints. | |
| Also covers invalid-UTF-8 source files: these are read with | |
| errors="ignore", which silently DROPS any byte that isn't valid UTF-8 with | |
| no warning, anywhere. A dropped byte inside a string literal corrupts that | |
| literal's contents (e.g. "Café Menu" -> "Caf Menu") without raising any | |
| error or appearing in any log -- the file still parses fine, the symbol | |
| still extracts fine, the code text is just silently wrong. | |
| check_and_warn_encoding exists so this corruption is at least visible once | |
| per file. | |
| """ | |
| import logging | |
| import os | |
| import threading | |
| from typing import Set | |
# Guards both de-dup caches below; take it before reading or mutating either.
_lock = threading.Lock()

# Absolute paths of files that already produced a SyntaxError warning.
_warned_files: Set[str] = set()

# Absolute paths of files that already produced an invalid-UTF-8 warning.
_warned_encoding_files: Set[str] = set()
def warn_syntax_error_once(logger: logging.Logger, filename: str, exc: SyntaxError) -> None:
    """Emit a one-time SyntaxError warning for `filename`.

    Files are de-duplicated by absolute path, so repeated calls for the
    same file (however the path is spelled) log only on the first call.
    The message itself shows just the base name, plus the error's `msg`
    and `lineno`.
    """
    abs_path = os.path.abspath(filename)

    # Test-and-record under the lock so two threads cannot both decide
    # they are the first to see this file.
    with _lock:
        already_warned = abs_path in _warned_files
        _warned_files.add(abs_path)

    if already_warned:
        return

    short_name = os.path.basename(filename)
    logger.warning(
        "\033[93mSkipping %s due to SyntaxError: %s (line %s)\033[0m",
        short_name, exc.msg, exc.lineno,
    )
def check_and_warn_encoding(logger: logging.Logger, filename: str, raw_bytes: bytes) -> None:
    """Warn, at most once per file, when `raw_bytes` is not valid UTF-8.

    Nothing is logged for valid input. For invalid input the first call
    per absolute path logs a warning carrying the decoder's failure
    reason and the byte offset where decoding failed; later calls for the
    same file are silent. Processing is never interrupted -- the point is
    only to surface the data loss that a subsequent errors="ignore"
    decode by the caller would otherwise hide.
    """
    try:
        raw_bytes.decode("utf-8")
    except UnicodeDecodeError as decode_err:
        # Copy the details out now: the exception name is unbound once
        # the except clause ends.
        reason = decode_err.reason
        offset = decode_err.start
    else:
        return

    abs_path = os.path.abspath(filename)

    # Test-and-record under the lock so concurrent callers warn only once.
    with _lock:
        already_warned = abs_path in _warned_encoding_files
        _warned_encoding_files.add(abs_path)

    if already_warned:
        return

    short_name = os.path.basename(filename)
    logger.warning(
        "\033[93m%s is not valid UTF-8 (%s at byte %d) -- invalid bytes "
        "will be silently dropped, which can corrupt string literals "
        "in the extracted code\033[0m",
        short_name, reason, offset,
    )
def reset_warned_files() -> None:
    """Empty both de-dup caches so every file can warn again (handy in tests)."""
    with _lock:
        for cache in (_warned_files, _warned_encoding_files):
            cache.clear()