Diffcontext / diffcontext /_warn_once.py
trakshan-mishra
Deploy FastAPI & MCP server over SSE
036a2db
Raw
History Blame Contribute Delete
2.71 kB
"""
_warn_once.py — De-duplicate repeated warnings for the same file.
parser.py, graph_builder.py, and resolver.py each independently call
ast.parse() on every file. When a file has a syntax error, all three would
otherwise log their own identical warning. This module tracks which files
have already been warned about (per process run) so only the first one
actually prints.
Also covers invalid-UTF-8 source files: these are read with
errors="ignore", which silently DROPS any byte that isn't valid UTF-8 with
no warning, anywhere. A dropped byte inside a string literal corrupts that
literal's contents (e.g. "Café Menu" -> "Caf Menu") without raising any
error or appearing in any log -- the file still parses fine, the symbol
still extracts fine, the code text is just silently wrong.
check_and_warn_encoding exists so this corruption is at least visible once
per file.
"""
import logging
import os
import threading
from typing import Set
# De-dup caches, keyed by os.path.abspath(filename). They live for the whole
# process run unless reset_warned_files() is called.
# Files that have already had a SyntaxError warning logged.
_warned_files: Set[str] = set()
# Files that have already had an invalid-UTF-8 warning logged.
_warned_encoding_files: Set[str] = set()
# Held around every membership check / add / clear on the two sets above.
_lock = threading.Lock()
def warn_syntax_error_once(logger: logging.Logger, filename: str, exc: SyntaxError) -> None:
    """Emit a SyntaxError warning for `filename` unless one was already emitted.

    De-duplication is keyed on ``os.path.abspath(filename)``; the logged
    message itself shows only the file's base name, followed by the error's
    ``msg`` and ``lineno``.

    Args:
        logger: Logger that receives the warning.
        filename: Path of the file that failed to parse.
        exc: The SyntaxError raised while parsing `filename`.
    """
    abs_path = os.path.abspath(filename)

    # Check and mark in one critical section so two callers can't both
    # conclude they are the first to see this file.
    with _lock:
        first_sighting = abs_path not in _warned_files
        if first_sighting:
            _warned_files.add(abs_path)

    if not first_sighting:
        return

    logger.warning(
        "\033[93mSkipping %s due to SyntaxError: %s (line %s)\033[0m",
        os.path.basename(filename), exc.msg, exc.lineno,
    )
def check_and_warn_encoding(logger: logging.Logger, filename: str, raw_bytes: bytes) -> None:
    """Warn, at most once per file, when `raw_bytes` is not valid UTF-8.

    The bytes are decoded strictly purely as a validity probe; the decoded
    text is discarded. Nothing is raised to the caller on invalid input --
    the point is only to make visible the data loss that a later
    ``errors="ignore"`` decode by the caller would otherwise hide.

    Args:
        logger: Logger that receives the warning.
        filename: Path the bytes were read from; its absolute form is the
            de-dup key and its base name appears in the message.
        raw_bytes: Raw file contents to validate.
    """
    try:
        raw_bytes.decode("utf-8")
    except UnicodeDecodeError as decode_err:
        abs_path = os.path.abspath(filename)

        # set.add is idempotent, so record the prior state and add
        # unconditionally while the lock is held.
        with _lock:
            seen_before = abs_path in _warned_encoding_files
            _warned_encoding_files.add(abs_path)

        if seen_before:
            return

        logger.warning(
            "\033[93m%s is not valid UTF-8 (%s at byte %d) -- invalid bytes "
            "will be silently dropped, which can corrupt string literals "
            "in the extracted code\033[0m",
            os.path.basename(filename), decode_err.reason, decode_err.start,
        )
def reset_warned_files() -> None:
    """Forget every file warned about so far (syntax and encoding alike).

    After this call the next warning for any file is logged again. Intended
    mainly for tests that need a clean slate between cases.
    """
    with _lock:
        # Clear both caches inside one critical section so no caller can
        # observe one emptied and the other not.
        for seen_paths in (_warned_files, _warned_encoding_files):
            seen_paths.clear()