diff --git a/.gitignore b/.gitignore index 1da09a88cfa0ea5df2e0601f6fd830c9c78ce209..a1d162828db1f6c3431fa1d585a46743a23d5c78 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,13 @@ +# ============================================================================= # Python +# ============================================================================= __pycache__/ *.py[cod] *$py.class *.so .Python + +# Distribution / packaging build/ develop-eggs/ dist/ @@ -20,32 +24,96 @@ wheels/ .installed.cfg *.egg MANIFEST +pip-log.txt +pip-delete-this-directory.txt -# Virtual Environments +# ============================================================================= +# Virtual environments / dependency managers +# ============================================================================= venv/ env/ .env +.env.* +!.env.example .venv/ +.python-version +.tool-versions + +# ============================================================================= +# Test / type / lint caches +# ============================================================================= +.pytest_cache/ +.cache/ +.coverage +.coverage.* +htmlcov/ +coverage.xml +.tox/ +.nox/ +.mypy_cache/ +.ruff_cache/ +.pyre/ +.pytype/ -# IDEs +# ============================================================================= +# IDEs / editors +# ============================================================================= .idea/ .vscode/ *.swp *.swo +*~ +*.iml +.project +.pydevproject -# macOS +# ============================================================================= +# OS noise +# ============================================================================= .DS_Store .AppleDouble .LSOverride +Thumbs.db +desktop.ini -# Project Specific Outputs -*.txt -*.md -!README.md +# ============================================================================= +# Gradio / Hugging Face Spaces +# ============================================================================= +.gradio/ +gradio_cached_examples/ +flagged/ + +# 
============================================================================= +# BibGuard outputs (generated by main.py / app.py) +# ============================================================================= +bibguard_output/ +*_only_used.bib *_only_used_entry.bib +bibliography_report.md +latex_quality_report.md +line_by_line_report.md +report.html +report.json +# Local HTTP cache used by src/utils/http.py +.cache/bibguard/ +**/.cache/bibguard/ + +# ============================================================================= +# User secrets / personal config +# Recommendation: ship `bibguard.example.yaml` and gitignore the real one +# so API keys / personal paths don't leak. See README for details. +# ============================================================================= +# bibguard.yaml +config.yaml +.bibguard.yaml +.bibguard.yml +secrets.yaml +*.local.yaml -# LaTeX and Bibliography (User Data) -# Ignoring these to prevent committing personal paper content +# ============================================================================= +# User paper data (LaTeX / BibTeX sources and build artifacts) +# Keep README.md, requirements*.txt, and source-tree .md files. +# ============================================================================= *.tex *.bib *.pdf @@ -57,6 +125,22 @@ env/ *.synctex.gz *.fls *.fdb_latexmk +*.toc +*.lof +*.lot +*.nav +*.snm +*.vrb -# cache -.cache \ No newline at end of file +# Markdown / text files: ignore by default to prevent committing user paper +# content, but keep documentation and project metadata. 
+*.txt +*.md +!README.md +!CHANGELOG.md +!CONTRIBUTING.md +!LICENSE.md +!docs/**/*.md +!requirements.txt +!requirements-*.txt +!**/requirements.txt diff --git a/README.md b/README.md index 46aae254400834afcaa15da7b670646db9b23541..7a69b636b341ef8c70a192eafcc735c1eba698df 100644 --- a/README.md +++ b/README.md @@ -11,35 +11,46 @@ pinned: false # BibGuard: Bibliography & LaTeX Quality Auditor -**BibGuard** is your comprehensive quality assurance tool for academic papers. It validates bibliography entries against real-world databases and checks LaTeX submission quality to catch errors before you submit. +**BibGuard** is a comprehensive quality-assurance tool for academic papers. It validates every bibliography entry against real-world databases, checks LaTeX submission quality, flags retracted DOIs and broken URLs, and uses an LLM (optional) to verify that cited papers actually support your claims. -AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and uses advanced LLMs to ensure cited papers actually support your claims. +AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and produces a single, beautiful, self-contained HTML report you can open offline. ## πŸ›‘ Why BibGuard? 
-- **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata -- **πŸ“‹ LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, and submission compliance problems -- **πŸ”’ Safe & Non-Destructive**: Your original files are **never modified** - only detailed reports are generated -- **🧠 Contextual Relevance**: Ensure cited papers actually discuss what you claim (with LLM) -- **⚑ Efficiency Boost**: Drastically reduce time needed to manually verify hundreds of citations +- **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata +- **🚫 Catch Retractions**: Detect references to papers that have been retracted or are under "expression of concern" +- **πŸ”— Detect Broken URLs**: HEAD-check `entry.url` to find dead links before reviewers do +- **πŸ“‹ LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, double-blind compliance, AI-text artifacts +- **πŸ”’ Safe & Non-Destructive**: Your original files are **never modified** β€” only reports are generated +- **🧠 Contextual Relevance** *(optional, with LLM)*: Score each citation 1-5 and tag its role (baseline/method/dataset/counterexample/survey/motivation/other) +- **⚑ Re-runs are fast**: SQLite-backed HTTP cache + auto-retry mean the second run on the same paper completes in seconds ## πŸš€ Features ### Bibliography Validation -- **πŸ” Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar -- **πŸ€– AI Relevance Check**: Uses LLMs to verify citations match their context (optional) -- **πŸ“Š Preprint Detection**: Warns if >50% of references are preprints (arXiv, bioRxiv, etc.) 
-- **πŸ‘€ Usage Analysis**: Highlights missing citations and unused bib entries -- **πŸ‘― Duplicate Detector**: Identifies duplicate entries with fuzzy matching +- **πŸ” Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar +- **🚫 Retraction Detection**: Flags retracted/withdrawn DOIs via CrossRef's `update-to` relation +- **πŸ”— URL Liveness Check**: Optional HEAD-then-GET check on every `entry.url` +- **πŸ“Š Preprint Detection**: Warns if >50% of references are preprints, and suggests published versions when arXiv records them +- **πŸ‘€ Usage Analysis**: Highlights missing citations and unused bib entries +- **πŸ‘― Duplicate Detection**: Identifies duplicate entries with fuzzy matching +- **πŸ€– AI Relevance + Role Tagging** *(optional)*: 1-5 relevance score plus citation role classification ### LaTeX Quality Checks -- **πŸ“ Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation -- **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases -- **πŸ”€ Consistency**: Spelling variants (US/UK English), hyphenation, terminology -- **πŸ€– AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants -- **πŸ”  Acronym Validation**: Ensures acronyms are defined before use (smart matching) -- **🎭 Anonymization**: Checks for identity leaks in double-blind submissions -- **πŸ“… Citation Age**: Flags references older than 30 years +- **πŸ“ Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation +- **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases +- **πŸ”€ Consistency**: Spelling variants (US/UK English), hyphenation, terminology β€” augmentable via project glossary +- **πŸ€– AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants +- **πŸ”  Acronym Validation**: Ensures acronyms are defined before 
use, with a project-glossary skip list +- **🎭 Anonymization**: Checks for identity leaks in double-blind submissions +- **πŸ“… Citation Age**: Flags references older than 30 years +- **πŸŽ“ Conference Templates**: Mandatory-section and style-package checks for ACL, EMNLP, NAACL, CVPR, ICCV, ECCV, NeurIPS, ICML, ICLR + +### Outputs +- πŸ“„ **Markdown reports** β€” bibliography validation + LaTeX quality issues +- 🌐 **Self-contained HTML** β€” dark mode, full-text search, per-section severity filters, inline highlighting of the offending span on each LaTeX issue. Opens offline, no server required +- πŸ€– **JSON** for CI / scripts / custom dashboards +- 🧹 **Cleaned `.bib`** containing only entries actually cited in the paper ## πŸ“¦ Installation @@ -57,10 +68,9 @@ pip install -r requirements.txt python main.py --init ``` -This creates `config.yaml`. Edit it to set your file paths. You have two modes: +This creates `config.yaml`. Edit it to point at your `.bib` and `.tex` files. -#### Option A: Single File Mode -Best for individual papers. +#### Single File Mode ```yaml files: bib: "paper.bib" @@ -68,141 +78,186 @@ files: output_dir: "bibguard_output" ``` -#### Option B: Directory Scan Mode -Best for large projects or a collection of papers. BibGuard will recursively search for all `.tex` and `.bib` files. +#### Directory Scan Mode +For projects with multiple `.tex` and `.bib` files: ```yaml files: input_dir: "./my_project_dir" output_dir: "bibguard_output" ``` -### 2. Run Full Check +### 2. 
Run a Check ```bash -python main.py +python main.py # full check using config.yaml / bibguard.yaml +python main.py --quick # local-only checks (no network, instant) +python main.py --format json,html # pick output formats +python main.py --verbose # DEBUG logs to stderr +python main.py --config my.yaml # custom config path +python main.py --list-templates # list conference templates ``` -**Output** (in `bibguard_output/`): -- `bibliography_report.md` - Bibliography validation results -- `latex_quality_report.md` - Writing and formatting issues -- `line_by_line_report.md` - All issues sorted by line number -- `*_only_used.bib` - Clean bibliography (used entries only) +**Default outputs** (in `bibguard_output/`): +- `report.html` β€” single self-contained HTML, opens offline, dark-mode aware +- `report.json` β€” full machine-readable dump (only when `json` is in `output.formats`) +- `bibliography_report.md` β€” bibliography validation, with corroboration notes +- `latex_quality_report.md` β€” LaTeX quality issues, errors / warnings / suggestions, full line content with the offending span bolded +- `_only_used.bib` β€” clean bibliography of cited entries only ## πŸ›  Configuration -Edit `config.yaml` to customize checks: +`bibguard.yaml` (or `config.yaml`) contains the following sections: ```yaml +files: + bib: "paper.bib" + tex: "paper.tex" + output_dir: "bibguard_output" + +network: + contact_email: "" # used in polite-pool User-Agent for arXiv/CrossRef/OpenAlex + cache_enabled: true # local SQLite cache for HTTP responses (~/.cache/bibguard) + cache_ttl_hours: 24 + retry_total: 5 # auto-retry on 429/5xx with exponential backoff + retry_backoff_factor: 1.5 + +template: "" # acl | emnlp | naacl | cvpr | iccv | eccv | neurips | icml | iclr + bibliography: - check_metadata: true # Validate against online databases (takes time) - check_usage: true # Find unused/missing entries - check_duplicates: true # Detect duplicate entries - check_preprint_ratio: true # Warn if 
>50% are preprints + check_metadata: true # verify against online databases (slow on first run, fast on repeats) + check_usage: true # find unused entries / missing citations + check_duplicates: true + check_preprint_ratio: true # warn if >50% of references are preprints check_relevance: false # LLM-based relevance check (requires API key) -submission: - # Format checks - caption: true # Table/figure caption placement - reference: true # Cross-reference integrity - formatting: true # Citation spacing, blank lines - equation: true # Equation punctuation, numbering - - # Writing quality - sentence: true # Weak starters, hedging language - consistency: true # Spelling, hyphenation, terminology - acronym: true # Acronym definitions (3+ letters) - - # Submission compliance - ai_artifacts: true # AI-generated text detection - anonymization: true # Double-blind compliance - citation_quality: true # Old citations (>30 years) - number: true # Percentage formatting +submission_extra: + url_liveness: false # HEAD-check every entry.url field (slow) + retraction: true # flag retracted DOIs via CrossRef + +submission: # 11 LaTeX checkers β€” toggle each independently + caption: true + reference: true + formatting: true + equation: true + ai_artifacts: true + sentence: true + consistency: true + acronym: true + number: true + citation_quality: true + anonymization: true + +# Project glossary feeds the consistency / acronym checkers. +glossary: + preferred: + - "Transformer" + - "fine-tuning" + acronyms: + NLP: "Natural Language Processing" + LLM: "Large Language Model" + +llm: + backend: "gemini" # gemini | openai | anthropic | deepseek | ollama | vllm + model: "" # leave empty for sensible default per backend + api_key: "" # PREFER env var: $GEMINI_API_KEY / $OPENAI_API_KEY / etc. 
+ +output: + quiet: false + minimal_verified: false + formats: [markdown, html] # any of: markdown, html, json ``` -## πŸ€– LLM-Based Relevance Check +## πŸ€– LLM-Based Relevance + Role Tagging -To verify citations match their context using AI: +When `bibliography.check_relevance` is `true`, BibGuard sends each citation's surrounding context plus the cited paper's abstract to your chosen LLM. The model returns a 1-5 relevance score, an `is_relevant` boolean, a one-sentence explanation, and a **citation role**: -```yaml -bibliography: - check_relevance: true +- `baseline` β€” cited as a comparison/baseline +- `method` β€” cited paper introduces a method this one builds on +- `dataset` β€” provides a dataset/benchmark used here +- `counterexample` β€” cited to argue against +- `survey` β€” cited as a survey/overview +- `motivation` β€” cited to motivate the problem +- `other` -llm: - backend: "gemini" # Options: gemini, openai, anthropic, deepseek, ollama, vllm - api_key: "" # Or use environment variable (e.g., GEMINI_API_KEY) +**Supported backends**: Gemini, OpenAI, Anthropic, DeepSeek, Ollama (local), vLLM (custom endpoint). + +**API keys**: read from environment variables by convention β€” `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `DEEPSEEK_API_KEY`. Set them in your shell rather than committing `api_key:` to `bibguard.yaml`. + +## 🌐 Web UI + +```bash +python app.py ``` -**Supported Backends:** -- **Gemini** (Google): `GEMINI_API_KEY` -- **OpenAI**: `OPENAI_API_KEY` -- **Anthropic**: `ANTHROPIC_API_KEY` -- **DeepSeek**: `DEEPSEEK_API_KEY` (recommended for cost/performance) -- **Ollama**: Local models (no API key needed) -- **vLLM**: Custom endpoint +Opens at `http://localhost:7860`. 
The web UI mirrors the CLI but with a streaming status panel and three presets: + +- **Quick** — local checks only, no network, instant +- **Standard** — local + retraction lookup (CrossRef) +- **Strict** — adds multi-source metadata fetch + URL liveness (slow on first run; subsequent runs are cached) + +The toolbar fits in one row: file uploads, preset chips, and Run / Stop. Per-check overrides live in the **Advanced** accordion. The report renders inline as a self-contained iframe so the page stays stable while entries stream in. Downloads (HTML, Markdown bib, JSON, cleaned `.bib`, `bibguard.log`) appear in the **Downloads** accordion below. + +Set `BIBGUARD_CONTACT_EMAIL=you@example.com` in your shell to use a real contact in the polite-pool User-Agent. + +## 🪝 Pre-commit Hook + +To run BibGuard automatically before each commit that touches `.tex` or `.bib`: -Then run: ```bash -python main.py +cd /path/to/your-paper-repo +bash /path/to/BibGuard/scripts/install-hook.sh ``` +Skip the hook for one commit with `git commit --no-verify`. + ## 📝 Understanding Reports -### Bibliography Report -Shows for each entry: -- ✅ **Verified**: Metadata matches online databases -- ⚠️ **Issues**: Mismatches, missing entries, duplicates -- 📊 **Statistics**: Usage, duplicates, preprint ratio +### Self-Contained HTML (`report.html`) +The recommended output. Single file, no external assets, dark-mode aware. 
Includes: +- Three tabs: **Bibliography** Β· **LaTeX Quality** Β· **Retractions / URLs** +- **Per-section filter chips** β€” bibliography filters by Verified / Unverified / Unused; LaTeX quality filters by Errors / Warnings / Info +- **Full-text search** across titles, authors, keys, and messages β€” works inside the active tab +- **Inline span highlighting** β€” for LaTeX issues that come from a regex (e.g., `\cite{}` without `~`), the offending substring is wrapped in `` so you can see exactly *where* in the line to look +- **Honest empty states** β€” Retractions / URL liveness panels report how many entries actually carried a `doi=` / `url=` field, so an empty result no longer looks like the check failed silently +- Theme toggle that overrides system preference -### LaTeX Quality Report -Organized by severity: -- πŸ”΄ **Errors**: Critical issues (e.g., undefined references) -- 🟑 **Warnings**: Important issues (e.g., inconsistent spelling) -- πŸ”΅ **Suggestions**: Style improvements (e.g., weak sentence starters) +### Markdown Reports +Two files for granular review and code review tooling: +- `bibliography_report.md` β€” every entry with metadata-match status, including positive **corroboration notes** when a second source agreed +- `latex_quality_report.md` β€” issues grouped by checker and severity, full line content with the offending span bolded -### Line-by-Line Report -All LaTeX issues sorted by line number for easy fixing. +### JSON Output +Machine-readable dump for CI integration. Top-level keys: `meta`, `summary`, `entries`, `submission_results`, `retractions`, `url_findings`, `duplicates`, `missing_citations`. ## 🧐 Understanding Mismatches BibGuard is strict, but false positives happen: -1. **Year Discrepancy (Β±1 Year)**: - - *Reason*: Delay between preprint (arXiv) and official publication - - *Action*: Verify which version you intend to cite - -2. 
**Author List Variations**: - - *Reason*: Different databases handle large author lists differently - - *Action*: Check if primary authors match - -3. **Venue Name Differences**: - - *Reason*: Abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems") - - *Action*: Both are usually correct +1. **Year Discrepancy (±1 Year)** — preprint vs. official publication. Verify which version you intend to cite. +2. **Author List Variations** — different databases truncate large author lists differently. Check primary authors. +3. **Venue Name Differences** — abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems"). Both are usually correct. +4. **Non-Academic Sources** — blogs and documentation aren't indexed by academic databases. Verify URL and title manually. -4. **Non-Academic Sources**: - - *Reason*: Blogs, documentation not indexed by academic databases - - *Action*: Manually verify URL and title +## 🔧 Performance Notes -## 🔧 Advanced Options - -```bash -python main.py --help # Show all options -python main.py --list-templates # List conference templates -python main.py --config my.yaml # Use custom config file -``` +- **First run** with `check_metadata: true` on ~100 entries: 1-3 minutes (rate-limited by arXiv/CrossRef). +- **Re-runs**: seconds, thanks to the SQLite HTTP cache at `~/.cache/bibguard/http_cache.sqlite` (TTL 24h by default). +- **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers. +- **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with a cold cache. ## 🤝 Contributing -Contributions welcome! Please open an issue or pull request. +Contributions welcome. Open an issue or pull request. 
## πŸ™ Acknowledgments -BibGuard uses multiple data sources: -- arXiv API -- CrossRef API -- Semantic Scholar API -- DBLP API -- OpenAlex API -- Google Scholar (via scholarly) +BibGuard uses the following data sources: +- [arXiv API](https://info.arxiv.org/help/api/index.html) +- [CrossRef REST API](https://api.crossref.org) +- [Semantic Scholar Graph API](https://api.semanticscholar.org) +- [DBLP API](https://dblp.org/faq/How+to+use+the+dblp+search+API.html) +- [OpenAlex API](https://docs.openalex.org) +- Google Scholar (via scraping; rate-limited) --- diff --git a/app.py b/app.py index bb1b2fed83ad55ba15313e633f256ac8d7e26a6c..ca6bbc96b4eb7cdfbef23c55d4bc1fe94ce4d40f 100644 --- a/app.py +++ b/app.py @@ -1,927 +1,1243 @@ #!/usr/bin/env python3 """ -BibGuard Gradio Web Application +BibGuard Gradio web app β€” minimalist iframe layout. -A web interface for checking bibliography and LaTeX quality. +The right pane embeds the self-contained ``report.html`` produced by +``src/report/html_report.py`` via ``' + ) + + +def _status_html(stage: str, detail: str = "", meta: list[str] | None = None, + state: str = "running") -> str: + """Render the live-status strip shown above the report. + + Layout is a single horizontal row: [stage] [detail] [meta chips]. + Wraps cleanly on narrow screens. + """ + if state == "running": + stage_icon = '' + elif state == "done": + stage_icon = 'βœ“' + elif state == "error": + stage_icon = '⚠' + else: + stage_icon = 'β—‹' + detail_html = f'{detail}' if detail else '' + meta_html = "" + if meta: + meta_html = ( + '' + + " ".join(f"{m}" for m in meta) + + "" + ) + return ( + f'
' + f'
' + f'{stage_icon}{stage}' + f'{detail_html}{meta_html}' + f'
' + ) + + +# --------------------------------------------------------------- config glue def create_config_from_ui( - check_metadata: bool, - check_usage: bool, - check_duplicates: bool, - check_preprint_ratio: bool, - caption: bool, - reference: bool, - formatting: bool, - equation: bool, - ai_artifacts: bool, - sentence: bool, - consistency: bool, - acronym: bool, - number: bool, - citation_quality: bool, - anonymization: bool + check_metadata, check_usage, check_duplicates, check_preprint_ratio, + caption, reference, formatting, equation, ai_artifacts, + sentence, consistency, acronym, number, citation_quality, anonymization, ) -> BibGuardConfig: - """Create a BibGuardConfig from UI settings.""" config = BibGuardConfig() - config.bibliography = BibliographyConfig( check_metadata=check_metadata, check_usage=check_usage, check_duplicates=check_duplicates, check_preprint_ratio=check_preprint_ratio, - check_relevance=False # Disabled for web + check_relevance=False, # LLM disabled in web mode ) - config.submission = SubmissionConfig( - caption=caption, - reference=reference, - formatting=formatting, - equation=equation, - ai_artifacts=ai_artifacts, - sentence=sentence, - consistency=consistency, - acronym=acronym, - number=number, - citation_quality=citation_quality, - anonymization=anonymization + caption=caption, reference=reference, formatting=formatting, equation=equation, + ai_artifacts=ai_artifacts, sentence=sentence, consistency=consistency, + acronym=acronym, number=number, citation_quality=citation_quality, + anonymization=anonymization, ) - config.output = OutputConfig(quiet=True, minimal_verified=False) - return config -def generate_bibliography_html(report_gen: ReportGenerator, entries: list) -> str: - """Generate HTML content for bibliography report.""" - html = ['
'] - - # 1. Summary Stats - total = len(entries) - verified = sum(1 for e in report_gen.entries if e.comparison and e.comparison.is_match) - used = sum(1 for e in report_gen.entries if e.usage and e.usage.is_used) - - html.append('
') - html.append(f'
{total}
Total Entries
') - html.append(f'
{verified}
Verified
') - html.append(f'
{used}
Used in Text
') - html.append('
') - - # 2. Entries - for report in report_gen.entries: - entry = report.entry - status_badges = [] - - # Metadata Status - if report.comparison: - if report.comparison.is_match: - status_badges.append('βœ“ Verified') - if report.comparison.source: - status_badges.append(f'{report.comparison.source.upper()}') - else: - status_badges.append('⚠ Metadata Mismatch') - else: - status_badges.append('No Metadata Check') - - # Usage Status - if report.usage: - if report.usage.is_used: - status_badges.append(f'Used: {report.usage.usage_count}x') - else: - status_badges.append('Unused') - - # Build Card - html.append(f''' -
-
-
-

{entry.title or "No Title"}

-
{entry.key} β€’ {entry.year} β€’ {entry.entry_type}
-
-
- {" ".join(status_badges)} -
-
- -
-
- { - (lambda e: "".join([ - f'
{k}
{v}
' - for k, v in filter(None, [ - ("Authors", e.author or "N/A"), - ("Venue", e.journal or e.booktitle or e.publisher or "N/A"), - ("DOI", e.doi) if e.doi else None, - ("ArXiv", e.arxiv_id) if e.arxiv_id and not e.doi else None, - ("Volume/Pages", f"{'Vol.'+e.volume if e.volume else ''} {'pp.'+e.pages if e.pages else ''}".strip()) if e.volume or e.pages else None, - ("URL", f'Link') if e.url else None - ]) - ]))(entry) - } -
- ''') - - # Add issues if any - issues = [] - if report.comparison and not report.comparison.is_match: - # Add main message derived from match status - if report.comparison.issues: - for issue in report.comparison.issues: - issues.append(f'
β€’ {issue}
') - else: - issues.append(f'
β€’ Verification failed
') - - if issues: - html.append('
') - html.append("".join(issues)) - html.append('
') - - html.append('
') # Close card-content and report-card - - html.append('
') # Close container - return "".join(html) - -def generate_latex_html(results: list) -> str: - """Generate HTML for LaTeX quality check.""" - from src.checkers import CheckSeverity - - html = ['
'] - - # Stats - errors = sum(1 for r in results if r.severity == CheckSeverity.ERROR) - warnings = sum(1 for r in results if r.severity == CheckSeverity.WARNING) - infos = sum(1 for r in results if r.severity == CheckSeverity.INFO) - - html.append('
') - html.append(f'
{errors}
Errors
') - html.append(f'
{warnings}
Warnings
') - html.append(f'
{infos}
Suggestions
') - html.append('
') - - if not results: - html.append('
βœ… No issues found in LaTeX code!
') - else: - # Group by Checker - results.sort(key=lambda x: x.checker_name) - current_checker = None - - for result in results: - badge_class = "badge-neutral" - if result.severity == CheckSeverity.ERROR: badge_class = "badge-error" - elif result.severity == CheckSeverity.WARNING: badge_class = "badge-warning" - elif result.severity == CheckSeverity.INFO: badge_class = "badge-info" - - html.append(f''' -
-
-
-

{result.checker_name}

-
Line {result.line_number}
-
- {result.severity.name} -
-
- {result.message} - {f'
{result.line_content}
' if result.line_content else ''} - {f'
πŸ’‘ Suggestion: {result.suggestion}
' if result.suggestion else ''} -
-
- ''') - - html.append('
') - return "".join(html) - -def generate_line_html(content: str, results: list) -> str: - """Generate HTML for Line-by-Line report.""" - # Build a dictionary of line_number -> list of issues - issues_by_line = {} - for r in results: - if r.line_number not in issues_by_line: - issues_by_line[r.line_number] = [] - issues_by_line[r.line_number].append(r) - - lines = content.split('\n') - - html = ['
'] - - html.append('
Issues are mapped to specific lines below.
') - - for i, line in enumerate(lines, 1): - if i in issues_by_line: - # Highlight this line - line_issues = issues_by_line[i] - - html.append(f''' -
-
Line {i}
-
{line}
-
- ''') - - for issue in line_issues: - html.append(f'
β€’ {issue.message}
') - - html.append('
') - - html.append('
') - return "".join(html) +def apply_preset(name: str): + p = PRESETS.get(name, PRESETS["Standard"]) + sub = p["submission"] + return ( + p["check_metadata"], p["check_usage"], p["check_duplicates"], p["check_preprint_ratio"], + sub["caption"], sub["reference"], sub["formatting"], sub["equation"], + sub["ai_artifacts"], sub["sentence"], sub["consistency"], sub["acronym"], + sub["number"], sub["citation_quality"], sub["anonymization"], + p["url_liveness"], p["retraction"], + ) +_PRESET_CAPTIONS = { + "Quick": "local checks only Β· no network Β· instant", + "Standard": "local checks + retraction lookup (CrossRef)", + "Strict": "+ URL liveness + multi-source metadata (slow)", +} + + +def _preset_caption_html(name: str) -> str: + text = _PRESET_CAPTIONS.get(name, "") + return f'
{text}
' +# ------------------------------------------------------------------ run_check +# Streaming generator. Each yield is a 7-tuple: +# (iframe_html, status_html, html_path, md_path, json_path, +# cleaned_bib_path, log_path) +# `capture_run` attaches a per-run DEBUG file handler so any exception or +# warning anywhere in the pipeline is recorded with full traceback at +# `/bibguard.log`, which is then downloadable. The status panel +# surfaces warning+error counts so problems aren't invisible. + def run_check( - bib_file, - tex_file, - check_metadata: bool, - check_usage: bool, - check_duplicates: bool, - check_preprint_ratio: bool, - caption: bool, - reference: bool, - formatting: bool, - equation: bool, - ai_artifacts: bool, - sentence: bool, - consistency: bool, - acronym: bool, - number: bool, - citation_quality: bool, - anonymization: bool, - progress=gr.Progress() -) -> Tuple[str, str, str]: - """Run BibGuard checks and return three reports.""" - - if bib_file is None or tex_file is None: - return ( - "⚠️ Please upload both `.bib` and `.tex` files.", - "⚠️ Please upload both `.bib` and `.tex` files.", - "⚠️ Please upload both `.bib` and `.tex` files." + bib_file, tex_file, + check_metadata, check_usage, check_duplicates, check_preprint_ratio, + caption, reference, formatting, equation, ai_artifacts, + sentence, consistency, acronym, number, citation_quality, anonymization, + url_liveness=False, retraction=True, +): + """Run the full check pipeline as a streaming generator with per-run logging. + + `bib_file` / `tex_file` are filesystem path strings (carried by gr.State), + not gr.File objects. The status panel is the single source of progress + feedback β€” no separate gr.Progress bar. + """ + started = time.time() + + def _elapsed() -> str: + return f"⏱ {int(time.time() - started)}s" + + # Initial state: keep current report (None means clear). 
+ if not bib_file or not tex_file: + yield ( + _placeholder("Please choose both a .bib and a .tex file in the toolbar."), + _status_html("Waiting for files", + "Pick a .bib and a .tex file from the toolbar to start.", + state="error"), + None, None, None, None, None, ) - - try: - # Create config from UI - config = create_config_from_ui( - check_metadata, check_usage, check_duplicates, check_preprint_ratio, - caption, reference, formatting, equation, ai_artifacts, - sentence, consistency, acronym, number, citation_quality, anonymization + return + + # Allocate the artifact dir up-front so the per-run log lives next to + # the report files. + out_dir = Path(tempfile.mkdtemp(prefix="bibguard_")) + log_path_target = out_dir / "bibguard.log" + + # Reset per-source circuit breakers so a previous run's flaky source + # doesn't carry over and skip valid lookups in this run. + http_layer.reset_breakers() + + with capture_run(target_path=log_path_target) as (log_path, log_stats): + logger.info("=== run_check start: bib=%s tex=%s ===", bib_file, tex_file) + try: + yield from _run_check_impl( + bib_file, tex_file, out_dir, log_path, log_stats, + check_metadata, check_usage, check_duplicates, check_preprint_ratio, + caption, reference, formatting, equation, ai_artifacts, + sentence, consistency, acronym, number, citation_quality, anonymization, + url_liveness, retraction, started, _elapsed, + ) + except Exception as e: + logger.exception("run_check crashed (entry-level guard)") + yield ( + _placeholder(f"Unhandled error: {e}"), + _status_html("Failed", f"{e} β€” see bibguard.log for the full traceback.", + state="error"), + None, None, None, None, str(log_path), + ) + finally: + logger.info("=== run_check end: warnings=%d errors=%d ===", + log_stats.warnings, log_stats.errors) + + +def _run_check_impl( + bib_file, tex_file, out_dir, log_path, log_stats, + check_metadata, check_usage, check_duplicates, check_preprint_ratio, + caption, reference, formatting, equation, 
ai_artifacts, + sentence, consistency, acronym, number, citation_quality, anonymization, + url_liveness, retraction, started, _elapsed, +): + """Inner pipeline. Wrapped in `capture_run` by `run_check`. + + Every yield is a 7-tuple ending with the log path so the user can + download `bibguard.log` even from intermediate updates. + """ + log_path_str = str(log_path) + + bib_path = Path(bib_file) + tex_path = Path(tex_file) + logger.info("Inputs: bib=%s tex=%s out_dir=%s", bib_path, tex_path, out_dir) + + def _meta_with_logs(extra: list[str]) -> list[str]: + out = list(extra) + if log_stats.warnings or log_stats.errors: + out.append(f"⚠ {log_stats.warnings}w / {log_stats.errors}e logged") + return out + + yield ( + gr.update(), + _status_html("Validating files", + f"Reading {bib_path.name} and {tex_path.name}", + meta=_meta_with_logs([_elapsed()])), + None, None, None, None, log_path_str, + ) + + # Pre-flight content validation + bib_rep = validate_bib(bib_path) + tex_rep = validate_tex(tex_path) + msg = "\n".join(filter(None, [ + format_report(bib_rep, bib_path.name), + format_report(tex_rep, tex_path.name), + ])) + if not bib_rep.ok or not tex_rep.ok: + logger.error("File validation failed:\n%s", msg) + block = ( + f'
' + f'
⚠️
' + f'
File validation failed
' + f'
{msg}
' + f'
' ) - - # Get file paths from uploaded files - bib_path = bib_file.name - tex_path = tex_file.name - - # Read tex content for checkers - tex_content = Path(tex_path).read_text(encoding='utf-8', errors='replace') - - # Parse files - bib_parser = BibParser() - entries = bib_parser.parse_file(bib_path) - - tex_parser = TexParser() - tex_parser.parse_file(tex_path) - - bib_config = config.bibliography - - # Initialize components - arxiv_fetcher = None - crossref_fetcher = None - semantic_scholar_fetcher = None - openalex_fetcher = None - dblp_fetcher = None - comparator = None - usage_checker = None - duplicate_detector = None - - if bib_config.check_metadata: - arxiv_fetcher = ArxivFetcher() - semantic_scholar_fetcher = SemanticScholarFetcher() - openalex_fetcher = OpenAlexFetcher() - dblp_fetcher = DBLPFetcher() - crossref_fetcher = CrossRefFetcher() - comparator = MetadataComparator() - - if bib_config.check_usage: - usage_checker = UsageChecker(tex_parser) - - if bib_config.check_duplicates: - duplicate_detector = DuplicateDetector() - - # Initialize report generator - report_gen = ReportGenerator( - minimal_verified=False, - check_preprint_ratio=bib_config.check_preprint_ratio, - preprint_warning_threshold=bib_config.preprint_warning_threshold + yield ( + block, + _status_html("File validation failed", msg.replace("\n", "
"), + state="error"), + None, None, None, None, log_path_str, ) - report_gen.set_metadata([bib_file.name], [tex_file.name]) - - # Run submission quality checks - progress(0.2, desc="Running LaTeX quality checks...") - submission_results = [] - enabled_checkers = config.submission.get_enabled_checkers() - - for checker_name in enabled_checkers: - if checker_name in CHECKER_REGISTRY: - checker = CHECKER_REGISTRY[checker_name]() + return + elif msg: + logger.info("Validation warnings:\n%s", msg) + + config = create_config_from_ui( + check_metadata, check_usage, check_duplicates, check_preprint_ratio, + caption, reference, formatting, equation, ai_artifacts, + sentence, consistency, acronym, number, citation_quality, anonymization, + ) + + yield ( + gr.update(), + _status_html("Parsing", "Loading bibliography and LaTeX source", + meta=_meta_with_logs([_elapsed()])), + None, None, None, None, log_path_str, + ) + + tex_content = tex_path.read_text(encoding='utf-8', errors='replace') + bib_parser = BibParser() + entries = bib_parser.parse_file(str(bib_path)) + tex_parser = TexParser() + tex_parser.parse_file(str(tex_path)) + logger.info("Parsed %d bib entries from %s", len(entries), bib_path.name) + + bib_config = config.bibliography + + # Init components + arxiv_fetcher = crossref_fetcher = ss_fetcher = oa_fetcher = dblp_fetcher = None + comparator = usage_checker = duplicate_detector = None + + if bib_config.check_metadata: + arxiv_fetcher = ArxivFetcher() + ss_fetcher = SemanticScholarFetcher() + oa_fetcher = OpenAlexFetcher() + dblp_fetcher = DBLPFetcher() + crossref_fetcher = CrossRefFetcher() + comparator = MetadataComparator() + if bib_config.check_usage: + usage_checker = UsageChecker(tex_parser) + if bib_config.check_duplicates: + duplicate_detector = DuplicateDetector() + + report_gen = ReportGenerator( + minimal_verified=False, + check_preprint_ratio=bib_config.check_preprint_ratio, + preprint_warning_threshold=bib_config.preprint_warning_threshold, + ) + 
report_gen.set_metadata([str(bib_path)], [str(tex_path)]) + + # Submission quality checks + yield ( + gr.update(), + _status_html("LaTeX quality checks", + f"Running {len(config.submission.get_enabled_checkers())} checkers on the LaTeX source", + meta=_meta_with_logs([f"πŸ“š {len(entries)} bib entries", _elapsed()])), + None, None, None, None, log_path_str, + ) + submission_results = [] + for name in config.submission.get_enabled_checkers(): + if name in CHECKER_REGISTRY: + try: + checker = CHECKER_REGISTRY[name]() results = checker.check(tex_content, {}) for r in results: - r.file_path = tex_file.name + r.file_path = str(tex_path) submission_results.extend(results) - - report_gen.set_submission_results(submission_results, None) - - # Check for duplicates - if bib_config.check_duplicates and duplicate_detector: - duplicate_groups = duplicate_detector.find_duplicates(entries) - report_gen.set_duplicate_groups(duplicate_groups) - - # Check missing citations - if bib_config.check_usage and usage_checker: - missing = usage_checker.get_missing_entries(entries) - report_gen.set_missing_citations(missing) - - # Build workflow - workflow_config = get_default_workflow() - - # Process entries - progress(0.3, desc="Processing bibliography entries...") - total_entries = len(entries) - - for i, entry in enumerate(entries): - progress(0.3 + 0.5 * (i / total_entries), desc=f"Checking: {entry.key}") - - # Check usage - usage_result = None + except Exception: + logger.exception("Checker %s crashed", name) + report_gen.set_submission_results(submission_results, None) + + if bib_config.check_duplicates and duplicate_detector: + try: + report_gen.set_duplicate_groups(duplicate_detector.find_duplicates(entries)) + except Exception: + logger.exception("Duplicate detection crashed") + if bib_config.check_usage and usage_checker: + try: + report_gen.set_missing_citations(usage_checker.get_missing_entries(entries)) + except Exception: + logger.exception("Missing-citation lookup crashed") + 
+ # Per-entry workflow + total = max(1, len(entries)) + workflow_config = get_default_workflow() + verified_count = 0 + flagged_count = 0 + not_found_count = 0 + last_yield = time.time() + + def _identifier_chip(entry) -> str: + """Tiny inline hint about which IDs we have for this entry.""" + bits = [] + if entry.doi: bits.append("DOI") + if entry.has_arxiv: bits.append("arXiv") + if entry.title and not bits: bits.append("title") + elif entry.title: bits.append("title") + return " + ".join(bits) if bits else "no identifiers" + + def _outcome_label(cmp) -> str: + if cmp is None: + return "" + if cmp.source == "unable": + return "? no metadata" + if cmp.is_match: + return f"βœ“ verified by {cmp.source}" + return f"⚠ flagged ({cmp.source})" + + for i, entry in enumerate(entries): + # ── Pre-fetch status: announce identifier set BEFORE the network roundtrip + # so the user sees what's being attempted, not just the entry name. + if bib_config.check_metadata and comparator: + now = time.time() + if now - last_yield > 0.4 or i == 0: + ids = _identifier_chip(entry) + detail = f"{entry.key} Β· querying via {ids}" + if entry.title: + short = entry.title[:70] + ("…" if len(entry.title) > 70 else "") + detail += f" β€” {short}" + yield ( + gr.update(), + _status_html( + f"Verifying entry {i + 1}/{total}", + detail, + meta=_meta_with_logs([ + f"πŸ“š {total} total", + f"βœ“ {verified_count}", + f"⚠ {flagged_count}", + f"? 
{not_found_count}", + _elapsed(), + ]), + ), + None, None, None, None, log_path_str, + ) + last_yield = now + + usage_result = None + comparison_result = None + try: if usage_checker: usage_result = usage_checker.check_usage(entry) - - # Fetch and compare metadata - comparison_result = None + except Exception: + logger.exception("Usage check crashed for entry=%s", entry.key) + try: if bib_config.check_metadata and comparator: comparison_result = fetch_and_compare_with_workflow( entry, workflow_config, arxiv_fetcher, crossref_fetcher, - semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator + ss_fetcher, oa_fetcher, dblp_fetcher, comparator, ) - - # Create entry report - entry_report = EntryReport( - entry=entry, - comparison=comparison_result, - usage=usage_result, - evaluations=[] + if comparison_result is None or comparison_result.source == "unable": + not_found_count += 1 + elif comparison_result.is_match: + verified_count += 1 + else: + flagged_count += 1 + except Exception: + logger.exception("Metadata fetch crashed for entry=%s", entry.key) + report_gen.add_entry_report(EntryReport( + entry=entry, comparison=comparison_result, + usage=usage_result, evaluations=[], + )) + + # ── Post-fetch status: show outcome inline so the user can watch + # results stream in (verified / flagged / not found). + now = time.time() + if now - last_yield > 0.4 or i == total - 1: + outcome = _outcome_label(comparison_result) + detail_parts = [f"{entry.key}"] + if outcome: + detail_parts.append(outcome) + if entry.title: + short = entry.title[:70] + ("…" if len(entry.title) > 70 else "") + detail_parts.append(f"{short}") + detail = " Β· ".join(detail_parts) + meta = _meta_with_logs([ + f"πŸ“š {i + 1}/{total}", + f"βœ“ {verified_count}", + f"⚠ {flagged_count}", + f"? 
{not_found_count}", + _elapsed(), + ]) + yield ( + gr.update(), + _status_html(f"Bibliography {i + 1}/{total}", detail, meta=meta), + None, None, None, None, log_path_str, ) - report_gen.add_entry_report(entry_report) - - progress(0.85, desc="Generating structured reports...") - - # Generate Bibliography HTML Report - bib_report = generate_bibliography_html(report_gen, entries) - - # Generate LaTeX Quality HTML Report - latex_report = generate_latex_html(submission_results) - - # Generate Line-by-Line HTML Report - line_report = "" - if submission_results: - line_report = generate_line_html(tex_content, submission_results) - else: - line_report = '
No issues to display line-by-line.
' - - progress(1.0, desc="Done!") - - return bib_report, latex_report, line_report - - except Exception as e: - error_msg = f"❌ Error: {str(e)}" - import traceback - error_msg += f"\n\n```\n{traceback.format_exc()}\n```" - return error_msg, error_msg, error_msg + last_yield = now + + if retraction: + try: + doi_count = sum(1 for e in entries if getattr(e, "doi", "")) + yield ( + gr.update(), + _status_html("Retraction lookups", + f"Querying CrossRef for {doi_count} DOI(s)", + meta=_meta_with_logs([_elapsed()])), + None, None, None, None, log_path_str, + ) + report_gen.set_retraction_findings(RetractionChecker().check_entries(entries)) + except Exception: + logger.exception("Retraction lookup crashed") + + if url_liveness: + try: + url_count = sum(1 for e in entries if getattr(e, "url", "")) + yield ( + gr.update(), + _status_html("URL liveness", + f"HEAD-checking {url_count} URL(s) in parallel", + meta=_meta_with_logs([_elapsed()])), + None, None, None, None, log_path_str, + ) + report_gen.set_url_findings(URLChecker().check_entries(entries)) + except Exception: + logger.exception("URL liveness crashed") + + # Save artifacts + yield ( + gr.update(), + _status_html("Building report", + "Rendering self-contained HTML, JSON, and Markdown", + meta=_meta_with_logs([_elapsed()])), + None, None, None, None, log_path_str, + ) + html_path = out_dir / "report.html" + md_path = out_dir / "bibliography_report.md" + json_path = out_dir / "report.json" + cleaned_bib_path: Path | None = None + try: + report_gen.save_html(str(html_path)) + report_gen.save_bibliography_report(str(md_path)) + report_gen.save_json(str(json_path)) + if usage_checker: + used_keys = {er.entry.key for er in report_gen.entries if er.usage and er.usage.is_used} + if used_keys: + cleaned_bib_path = out_dir / f"{bib_path.stem}_only_used.bib" + bib_parser.filter_file(str(bib_path), str(cleaned_bib_path), used_keys) + except Exception: + logger.exception("Artifact generation failed") + # Embed report.html as 
iframe srcdoc + if html_path.exists(): + iframe_html = _html_to_iframe(html_path.read_text(encoding='utf-8')) + else: + iframe_html = _placeholder("Report generation failed β€” see bibguard.log.") + + meta = _meta_with_logs([ + f"πŸ“š {len(entries)} entries", + f"βœ“ {verified_count} verified", + f"⚠ {flagged_count} flagged", + _elapsed(), + ]) + state = "done" + summary = "Report ready. Use the right pane to filter, search, and copy fixes." + if log_stats.errors > 0: + state = "error" + summary = (f"Done with {log_stats.errors} error(s) and {log_stats.warnings} warning(s) " + "logged β€” see bibguard.log for full tracebacks.") + elif log_stats.warnings > 0: + summary = (f"Report ready ({log_stats.warnings} warnings logged β€” see " + "bibguard.log).") + + yield ( + iframe_html, + _status_html("Done", summary, meta=meta, state=state), + str(html_path) if html_path.exists() else None, + str(md_path) if md_path.exists() else None, + str(json_path) if json_path.exists() else None, + str(cleaned_bib_path) if (cleaned_bib_path and cleaned_bib_path.exists()) else None, + log_path_str, + ) + + +# --------------------------------------------------------------------- layout -def create_app(): - """Create and configure the Gradio app.""" - - # Load icon as base64 - icon_html = "" +def create_app() -> gr.Blocks: + # Inline app icon as a base64 data URL β€” works regardless of cwd. + icon_html = 'πŸ›‘οΈ' try: - icon_path = Path("assets/icon-192.png") + icon_path = Path(__file__).parent / "assets" / "icon-192.png" if icon_path.exists(): with open(icon_path, "rb") as f: - encoding = base64.b64encode(f.read()).decode() - icon_html = f'BibGuard' - else: - icon_html = 'πŸ“š' - except Exception: - icon_html = 'πŸ“š' - - with gr.Blocks(title="BibGuard - Bibliography & LaTeX Quality Checker") as app: - - # Header with icon - with gr.Row(elem_classes=["app-header"]): - gr.HTML(f""" -
- {icon_html} -
-

BibGuard

-

Bibliography & LaTeX Quality Checker

-
-
- """) - - with gr.Row(elem_classes=["app-body"]): - # Left column: Upload & Settings - with gr.Column(scale=1, min_width=280, elem_classes=["app-sidebar"]): - gr.Markdown("### πŸ“ Upload Files") - - bib_file = gr.File( - label="Bibliography (.bib)", - file_types=[".bib"], - file_count="single" + b64 = base64.b64encode(f.read()).decode() + icon_html = ( + f'BibGuard' + ) + except Exception as e: + logger.debug("Icon load failed; using emoji fallback: %s", e, exc_info=True) + + with gr.Blocks( + title="BibGuard β€” Bibliography & LaTeX Quality Auditor", + ) as app: + + gr.HTML(f""" +
+ {icon_html} + BibGuard + β€” Bibliography & LaTeX quality auditor + + GitHub β†— +
+ """) + + # ───────────────────────── Top toolbar ───────────────────────── + # All primary controls on a single horizontal row, every primary + # widget pinned to 56px height. gr.UploadButton replaces gr.File + # because the latter's drop-zone doesn't shrink to a toolbar. + with gr.Row(elem_classes=["bg-toolbar"]): + with gr.Column(scale=2, min_width=200): + bib_btn = gr.UploadButton( + "πŸ“š Choose .bib file", + file_types=[".bib"], file_count="single", + elem_classes=["bg-upload-btn"], + ) + bib_status = gr.HTML('
no file selected
') + with gr.Column(scale=2, min_width=200): + tex_btn = gr.UploadButton( + "πŸ“„ Choose .tex file", + file_types=[".tex"], file_count="single", + elem_classes=["bg-upload-btn"], ) - - tex_file = gr.File( - label="LaTeX Source (.tex)", - file_types=[".tex"], - file_count="single" + tex_status = gr.HTML('
no file selected
') + with gr.Column(scale=3, min_width=280): + preset = gr.Radio( + choices=list(PRESETS.keys()), + value="Standard", + show_label=False, + elem_classes=["bg-preset"], ) - - # Check options in grid layout - gr.Markdown("#### βš™οΈ Options") - - with gr.Row(): - check_metadata = gr.Checkbox(label="πŸ” Metadata", value=False) - check_usage = gr.Checkbox(label="πŸ“Š Usage", value=True) - - with gr.Row(): - check_duplicates = gr.Checkbox(label="πŸ‘― Duplicates", value=True) - check_preprint_ratio = gr.Checkbox(label="πŸ“„ Preprints", value=True) - - with gr.Row(): - caption = gr.Checkbox(label="πŸ–ΌοΈ Captions", value=True) - reference = gr.Checkbox(label="πŸ”— References", value=True) - - with gr.Row(): - formatting = gr.Checkbox(label="✨ Formatting", value=True) - equation = gr.Checkbox(label="πŸ”’ Equations", value=True) - - with gr.Row(): - ai_artifacts = gr.Checkbox(label="πŸ€– AI Artifacts", value=True) - sentence = gr.Checkbox(label="πŸ“ Sentences", value=True) - - with gr.Row(): - consistency = gr.Checkbox(label="πŸ”„ Consistency", value=True) - acronym = gr.Checkbox(label="πŸ”€ Acronyms", value=True) - - with gr.Row(): - number = gr.Checkbox(label="πŸ”’ Numbers", value=True) - citation_quality = gr.Checkbox(label="πŸ“š Citations", value=True) - - with gr.Row(): - anonymization = gr.Checkbox(label="🎭 Anonymization", value=True) - - run_btn = gr.Button("πŸ” Check Now", variant="primary", size="lg") - - gr.HTML(""" -
- - - GitHub - -

Developed with ❀️ for researchers

-
- """) - - # Right column: Reports - with gr.Column(scale=4, elem_classes=["app-content"]): - with gr.Tabs(): - with gr.Tab("πŸ“š Bibliography Report"): - bib_report = gr.HTML( - value=WELCOME_HTML, - elem_classes=["report-panel"] - ) - - with gr.Tab("πŸ“ LaTeX Quality"): - latex_report = gr.HTML( - value=WELCOME_HTML, - elem_classes=["report-panel"] - ) - - with gr.Tab("πŸ“‹ Line-by-Line"): - line_report = gr.HTML( - value=WELCOME_HTML, - elem_classes=["report-panel"] - ) - - # Event handling - run_btn.click( + preset_caption = gr.HTML( + _preset_caption_html("Standard"), + ) + with gr.Column(scale=1, min_width=140): + run_btn = gr.Button("β–Ά Run check", variant="primary", + elem_classes=["bg-run-btn"]) + stop_btn = gr.Button("β—Ό Stop", variant="stop", + elem_classes=["bg-run-btn", "bg-stop-btn"], + visible=False) + gr.HTML('
 
') + + # Holds the selected file paths (strings). Updated by the UploadButton + # callbacks below so run_check sees plain paths regardless of how the + # user picked the files. + bib_path_state = gr.State(value=None) + tex_path_state = gr.State(value=None) + + # Advanced fine-grained toggles. Default closed β€” most users just + # pick a preset and go. Each tab is composed of gr.Row blocks of + # exactly 4 cells so columns line up vertically. Short rows are + # padded with invisible spacer HTML. + def _spacer(): + return gr.HTML('
 
', + elem_classes=["bg-row-spacer"]) + + with gr.Accordion("βš™οΈ Advanced settings", open=False): + with gr.Tabs(): + with gr.TabItem("Bibliography"): + with gr.Row(elem_classes=["bg-row"]): + check_metadata = gr.Checkbox(label="Metadata verify", value=False) + check_usage = gr.Checkbox(label="Usage", value=True) + check_duplicates = gr.Checkbox(label="Duplicates", value=True) + check_preprint_ratio = gr.Checkbox(label="Preprints", value=True) + with gr.Row(elem_classes=["bg-row"]): + retraction = gr.Checkbox(label="Retractions", value=True) + url_liveness = gr.Checkbox(label="URL liveness", value=False) + _spacer() + _spacer() + + with gr.TabItem("LaTeX format"): + with gr.Row(elem_classes=["bg-row"]): + caption = gr.Checkbox(label="Captions", value=True) + reference = gr.Checkbox(label="References", value=True) + formatting = gr.Checkbox(label="Formatting", value=True) + equation = gr.Checkbox(label="Equations", value=True) + + with gr.TabItem("Writing"): + with gr.Row(elem_classes=["bg-row"]): + ai_artifacts = gr.Checkbox(label="AI artifacts", value=True) + sentence = gr.Checkbox(label="Sentences", value=True) + consistency = gr.Checkbox(label="Consistency", value=True) + acronym = gr.Checkbox(label="Acronyms", value=True) + with gr.Row(elem_classes=["bg-row"]): + number = gr.Checkbox(label="Numbers", value=True) + citation_quality = gr.Checkbox(label="Citations", value=True) + anonymization = gr.Checkbox(label="Anonymization", value=True) + _spacer() + + # ───────────────────────── Status strip ───────────────────────── + status_panel = gr.HTML(value=EMPTY_STATUS_HTML, elem_id="bg-status-wrap") + + # ───────────────────────── Report (full width) ─────────────────── + with gr.Row(elem_classes=["bg-main"]): + report_panel = gr.HTML(value=EMPTY_PANEL_HTML) + + # ───────────────────────── Downloads ──────────────────────────── + with gr.Accordion("πŸ“₯ Downloads", open=False): + with gr.Row(elem_classes=["bg-downloads"]): + download_html = 
gr.File(label="report.html (offline)", + interactive=False, elem_classes=["bg-file-input"]) + download_md = gr.File(label="bibliography_report.md", + interactive=False, elem_classes=["bg-file-input"]) + download_json = gr.File(label="report.json", + interactive=False, elem_classes=["bg-file-input"]) + download_bib = gr.File(label="cleaned .bib", + interactive=False, elem_classes=["bg-file-input"]) + download_log = gr.File(label="bibguard.log", + interactive=False, elem_classes=["bg-file-input"]) + + gr.HTML( + '' + ) + + preset.change( + fn=apply_preset, + inputs=[preset], + outputs=[ + check_metadata, check_usage, check_duplicates, check_preprint_ratio, + caption, reference, formatting, equation, + ai_artifacts, sentence, consistency, acronym, + number, citation_quality, anonymization, + url_liveness, retraction, + ], + ) + preset.change( + fn=_preset_caption_html, + inputs=[preset], + outputs=[preset_caption], + ) + + # ---- Upload-button callbacks: store path in state + update chip ---- + + def _on_bib_upload(f): + if f is None: + return None, '
no file selected
' + path = getattr(f, "name", str(f)) + return path, f'
πŸ“š {Path(path).name}
' + + def _on_tex_upload(f): + if f is None: + return None, '
no file selected
' + path = getattr(f, "name", str(f)) + return path, f'
πŸ“„ {Path(path).name}
' + + bib_btn.upload(_on_bib_upload, inputs=[bib_btn], outputs=[bib_path_state, bib_status]) + tex_btn.upload(_on_tex_upload, inputs=[tex_btn], outputs=[tex_path_state, tex_status]) + + # Run pipeline: + # 1. Toggle visibility: hide Run, show Stop. + # 2. Stream run_check yields into report + status + downloads. + # 3. After completion, swap buttons back. + # Stop button cancels the streaming task via Gradio's `cancels=`. + def _show_stop(): + return gr.update(visible=False), gr.update(visible=True) + + def _show_run(): + return gr.update(visible=True), gr.update(visible=False) + + run_event = run_btn.click( + fn=_show_stop, inputs=None, outputs=[run_btn, stop_btn], + ).then( fn=run_check, inputs=[ - bib_file, tex_file, + bib_path_state, tex_path_state, check_metadata, check_usage, check_duplicates, check_preprint_ratio, caption, reference, formatting, equation, ai_artifacts, - sentence, consistency, acronym, number, citation_quality, anonymization + sentence, consistency, acronym, number, citation_quality, anonymization, + url_liveness, retraction, ], - outputs=[bib_report, latex_report, line_report] + outputs=[report_panel, status_panel, + download_html, download_md, download_json, download_bib, download_log], + ).then( + fn=_show_run, inputs=None, outputs=[run_btn, stop_btn], + ) + + stop_btn.click( + fn=lambda: ( + gr.update(visible=True), + gr.update(visible=False), + _status_html("Cancelled", + "Run interrupted by user. 
Partial results discarded.", + state="error"), + ), + inputs=None, + outputs=[run_btn, stop_btn, status_panel], + cancels=[run_event], ) - + return app -# Create the app app = create_app() + if __name__ == "__main__": + _favicon = Path(__file__).parent / "assets" / "icon-192.png" app.launch( - favicon_path="assets/icon-192.png", + favicon_path=str(_favicon) if _favicon.exists() else None, show_error=True, css=CUSTOM_CSS, - theme=gr.themes.Soft() + theme=gr.themes.Soft(), ) diff --git a/app_helper.py b/app_helper.py index 4780a3d5533081b245c538f8394b23192786ad2b..e486a24d10c917f2f6b679afd948b597b0d4e874 100644 --- a/app_helper.py +++ b/app_helper.py @@ -1,98 +1,307 @@ +""" +Per-entry metadata verification: parallel multi-source lookup with corroboration. + +Strategy (in order): + 1. **Identifier lookups, in parallel**: + - DOI β†’ CrossRef, Semantic Scholar, OpenAlex + - arXiv ID β†’ arXiv, Semantic Scholar + If the bib entry has either, this stage usually returns 2-3 independent + hits within a few hundred ms. Identifier lookups are far more reliable + than title search because the identifier is unique. + + 2. **Title searches across sources, in parallel** (always run as corroboration, + even if identifiers were found): Semantic Scholar, OpenAlex, DBLP, CrossRef, + arXiv. Each source returns top-K candidates; we keep the candidate whose + title most closely matches the bib title. + + 3. **Score & corroborate**: + - Pick the result with the highest per-source confidence. + - If β‰₯2 sources independently report the same title (sim β‰₯ 0.95) we + mark `is_match=True` even when individual confidences are middling + β€” multi-source agreement is the single strongest signal. + - Tightened thresholds: title sim β‰₯ 0.88 + year diff ≀ 1 (or year empty) + to declare a single-source match. Single-source matches that disagree + with corroborating sources are downgraded. + +The function still returns a single ComparisonResult so the rest of the +pipeline doesn't change. 
Extra evidence (sources tried, agreement count) is +stuffed into the `issues` field as informational notes when relevant. +""" +from __future__ import annotations + +import concurrent.futures as cf +import logging +from typing import List, Optional, Tuple + +from src.utils.normalizer import TextNormalizer + +logger = logging.getLogger(__name__) + +# Year tolerance for "match" (preprint vs published often differ by 1y). +_YEAR_TOL = 1 +# Title similarity required for single-source match. +_TITLE_MATCH_TIGHT = 0.88 +# Title similarity required to count as "corroborating" another source. +_TITLE_AGREE = 0.95 + + +def _title_sim(a: str, b: str) -> float: + if not a or not b: + return 0.0 + a_n = TextNormalizer.normalize_for_comparison(a) + b_n = TextNormalizer.normalize_for_comparison(b) + if not a_n or not b_n: + return 0.0 + jacc = TextNormalizer.similarity_ratio(a_n, b_n) + if max(len(a_n), len(b_n)) < 200: + lev = TextNormalizer.levenshtein_similarity(a_n, b_n) + return max(jacc, lev) + return jacc + + +def _year_close(y1: str, y2: str) -> bool: + """True if years are missing on either side or within Β±1.""" + y1, y2 = (y1 or "").strip(), (y2 or "").strip() + if not y1 or not y2: + return True + try: + return abs(int(y1[:4]) - int(y2[:4])) <= _YEAR_TOL + except ValueError: + return False + + +def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]: + """Pick the candidate whose title most closely matches `bib_title`.""" + best, best_sim = None, 0.0 + for c in candidates: + sim = _title_sim(bib_title, getattr(c, "title", "") or "") + if sim > best_sim: + best, best_sim = c, sim + return best, best_sim + + def fetch_and_compare_with_workflow( - entry, workflow_steps, arxiv_fetcher, crossref_fetcher, - semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator + entry, + workflow_steps, # accepted for API compat; ignored β€” strategy is fixed + arxiv_fetcher, + crossref_fetcher, + semantic_scholar_fetcher, + 
openalex_fetcher, + dblp_fetcher, + comparator, ): - """Fetch metadata from online sources using the configured workflow.""" - from src.utils.normalizer import TextNormalizer - - best_result = None - - # If no steps provided, use default order - if not workflow_steps: - # Create a default list of steps if needed, or simply handle logic here - pass - - # Simplified workflow execution: Run through enabled steps - # We manualy iterate through sources in a preferred order if workflow is not fully configured - # Or iterate through the steps list. - - # Since extracting WorkflowConfig logic is complex, let's just implement a robust - # default search strategy here which is what the user likely wants. - - results = [] - - # 1. DBLP (High quality for CS) - if dblp_fetcher and entry.title: - try: - dblp_result = dblp_fetcher.search_by_title(entry.title) - if dblp_result: - res = comparator.compare_with_dblp(entry, dblp_result) - if res.is_match: return res - results.append(res) - except Exception: pass - - # 2. Semantic Scholar (Comprehensive) - if semantic_scholar_fetcher and entry.title: - try: - ss_result = None - if entry.doi: - ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi) - if not ss_result: - ss_result = semantic_scholar_fetcher.search_by_title(entry.title) - - if ss_result: - res = comparator.compare_with_semantic_scholar(entry, ss_result) - if res.is_match: return res - results.append(res) - except Exception: pass - - # 3. OpenAlex - if openalex_fetcher and entry.title: - try: - oa_result = None - if entry.doi: - oa_result = openalex_fetcher.fetch_by_doi(entry.doi) - if not oa_result: - oa_result = openalex_fetcher.search_by_title(entry.title) - - if oa_result: - res = comparator.compare_with_openalex(entry, oa_result) - if res.is_match: return res - results.append(res) - except Exception: pass - - # 4. 
CrossRef (Official metadata) - if crossref_fetcher and entry.doi: - try: - crossref_result = crossref_fetcher.search_by_doi(entry.doi) - if crossref_result: - res = comparator.compare_with_crossref(entry, crossref_result) - if res.is_match: return res - results.append(res) - except Exception: pass - - # 5. ArXiv - if arxiv_fetcher: - try: - arxiv_meta = None - if entry.has_arxiv: - arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id) - elif entry.title: - # Search by title - search_results = arxiv_fetcher.search_by_title(entry.title, max_results=1) - if search_results: - arxiv_meta = search_results[0] - - if arxiv_meta: - res = comparator.compare_with_arxiv(entry, arxiv_meta) - if res.is_match: return res - results.append(res) - except Exception: pass - - # Return the best result (highest confidence) if no perfect match found - if results: - results.sort(key=lambda x: x.confidence, reverse=True) - return results[0] - - # If absolutely nothing found, return None or an 'Unable' result - return comparator.create_unable_result(entry, "No metadata found in any source") + """Look up `entry` across all available sources in parallel and return a single ComparisonResult.""" + has_doi = bool(getattr(entry, "doi", "") or "") + has_arxiv = bool(getattr(entry, "has_arxiv", False)) + has_title = bool(getattr(entry, "title", "") or "") + + if not (has_doi or has_arxiv or has_title): + return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up") + + # ------------------------------------------------------------------ stage 1 + # Tasks are tuples of (source_name, callable returning ComparisonResult or None). + tasks: list[tuple[str, callable]] = [] + + # Identifier-based lookups (high precision). 
+ if has_doi and crossref_fetcher: + def _t_cr_doi(e=entry): + r = crossref_fetcher.search_by_doi(e.doi) + return comparator.compare_with_crossref(e, r) if r else None + tasks.append(("crossref(doi)", _t_cr_doi)) + + if has_doi and semantic_scholar_fetcher: + def _t_s2_doi(e=entry): + r = semantic_scholar_fetcher.fetch_by_doi(e.doi) + return comparator.compare_with_semantic_scholar(e, r) if r else None + tasks.append(("s2(doi)", _t_s2_doi)) + + if has_doi and openalex_fetcher: + def _t_oa_doi(e=entry): + r = openalex_fetcher.fetch_by_doi(e.doi) + return comparator.compare_with_openalex(e, r) if r else None + tasks.append(("openalex(doi)", _t_oa_doi)) + + if has_arxiv and arxiv_fetcher: + def _t_arxiv_id(e=entry): + r = arxiv_fetcher.fetch_by_id(e.arxiv_id) + return comparator.compare_with_arxiv(e, r) if r else None + tasks.append(("arxiv(id)", _t_arxiv_id)) + + if has_arxiv and semantic_scholar_fetcher and not has_doi: + # If we already queried S2 by DOI we don't double-bill. + def _t_s2_arxiv(e=entry): + r = semantic_scholar_fetcher.fetch_by_arxiv_id(e.arxiv_id) + return comparator.compare_with_semantic_scholar(e, r) if r else None + tasks.append(("s2(arxiv)", _t_s2_arxiv)) + + # Title-based lookups (always run as corroboration if title available). 
+ if has_title: + if semantic_scholar_fetcher and not has_doi and not has_arxiv: + def _t_s2_title(e=entry): + cands = semantic_scholar_fetcher.search_by_title_multi(e.title, max_results=5) + best, _ = _pick_best_candidate(e.title, cands) + return comparator.compare_with_semantic_scholar(e, best) if best else None + tasks.append(("s2(title)", _t_s2_title)) + + if openalex_fetcher and not has_doi: + def _t_oa_title(e=entry): + cands = openalex_fetcher.search_by_title_multi(e.title, max_results=5) + best, _ = _pick_best_candidate(e.title, cands) + return comparator.compare_with_openalex(e, best) if best else None + tasks.append(("openalex(title)", _t_oa_title)) + + if dblp_fetcher: + def _t_dblp_title(e=entry): + cands = dblp_fetcher.search_by_title_multi(e.title, max_results=5) + best, _ = _pick_best_candidate(e.title, cands) + return comparator.compare_with_dblp(e, best) if best else None + tasks.append(("dblp(title)", _t_dblp_title)) + + if crossref_fetcher and not has_doi: + def _t_cr_title(e=entry): + cands = crossref_fetcher.search_by_title_multi(e.title, max_results=5) + best, _ = _pick_best_candidate(e.title, cands) + return comparator.compare_with_crossref(e, best) if best else None + tasks.append(("crossref(title)", _t_cr_title)) + + if arxiv_fetcher and not has_arxiv: + def _t_arxiv_title(e=entry): + cands = arxiv_fetcher.search_by_title(e.title, max_results=5) + best, _ = _pick_best_candidate(e.title, cands) + return comparator.compare_with_arxiv(e, best) if best else None + tasks.append(("arxiv(title)", _t_arxiv_title)) + + if not tasks: + return comparator.create_unable_result(entry, "No fetchers configured") + + # Run in parallel with EARLY EXIT. + # + # Strategy: + # - Submit every task to a pool. + # - Drain `as_completed` with a SHORT poll deadline. + # - Stop early as soon as we have one high-confidence match (β‰₯0.85) + # plus at least one corroborating result whose title aligns. + # - Hard ceiling: 18s total wall-clock per entry. 
Whatever finished + # by then is what we use; the rest is cancelled so we don't pay + # the slowest-source penalty (a 80s-rate-limited S2 retry, e.g.). + results: list = [] + sources_tried: list[str] = [] + entry_key = getattr(entry, "key", "") + deadline = __import__("time").monotonic() + 18.0 + HIGH_CONF = 0.85 + + def _have_corroborated(rs: list) -> bool: + if not rs: + return False + rs_sorted = sorted(rs, key=lambda r: r.confidence, reverse=True) + primary = rs_sorted[0] + if primary.confidence < HIGH_CONF: + return False + for other in rs_sorted[1:]: + if other.fetched_title and _title_sim(primary.fetched_title, + other.fetched_title) >= _TITLE_AGREE: + return True + return False + + pool = cf.ThreadPoolExecutor(max_workers=min(8, len(tasks))) + future_to_name = {pool.submit(fn): name for name, fn in tasks} + try: + pending = set(future_to_name) + while pending: + remaining = deadline - __import__("time").monotonic() + if remaining <= 0: + logger.debug("Entry=%s: 18s deadline reached, %d sources still pending", + entry_key, len(pending)) + break + done, pending = cf.wait(pending, timeout=min(remaining, 2.0), + return_when=cf.FIRST_COMPLETED) + for fut in done: + name = future_to_name[fut] + sources_tried.append(name) + try: + r = fut.result(timeout=0) + except Exception as e: + logger.warning( + "Lookup failed for entry=%s source=%s: %s", + entry_key, name, e, exc_info=True, + ) + continue + if r is not None: + results.append(r) + if _have_corroborated(results): + logger.debug("Entry=%s: corroborated early after %d sources", entry_key, len(results)) + break + finally: + # Cancel anything still in the queue; threads already running can't + # be killed, but they'll finish quietly without blocking us. 
+ for fut in future_to_name: + if not fut.done(): + fut.cancel() + pool.shutdown(wait=False, cancel_futures=True) + + if not results: + return comparator.create_unable_result( + entry, + f"Tried {len(tasks)} sources ({', '.join(sources_tried) or 'none'}) β€” no metadata returned" + ) + + # ------------------------------------------------------------------ stage 2: pick + corroborate + # Sort by confidence; pick top. + results.sort(key=lambda r: r.confidence, reverse=True) + primary = results[0] + + # Count corroborating sources that report a title within sim β‰₯ _TITLE_AGREE + # of the primary's fetched_title. + primary_title = primary.fetched_title + agree_count = 0 + distinct_sources = set() + for r in results: + if r is primary: + continue + if not r.fetched_title: + continue + if _title_sim(primary_title, r.fetched_title) >= _TITLE_AGREE: + agree_count += 1 + distinct_sources.add(r.source) + + # ------------------------------------------------------------------ stage 3: refine match decision + # Tighten / loosen `is_match` based on corroboration + year tolerance. + title_ok_tight = primary.title_similarity >= _TITLE_MATCH_TIGHT + year_ok_loose = _year_close(primary.bib_year, primary.fetched_year) + + if agree_count >= 1 and title_ok_tight: + primary.is_match = True + elif title_ok_tight and primary.author_match and year_ok_loose: + primary.is_match = True + elif primary.is_match and not (title_ok_tight and year_ok_loose): + # Original heuristic said match but our stricter rule disagrees. + primary.is_match = False + if not any("stricter check" in i.lower() for i in primary.issues): + primary.issues.append( + "Marked unverified by stricter check (title/year tolerance not met)." + ) + + # Boost / annotate confidence with corroboration signal. + if agree_count >= 1: + # Each corroborating source bumps confidence toward 1.0. 
+ bonus = min(0.25, 0.1 + 0.05 * agree_count) + primary.confidence = min(1.0, primary.confidence + bonus) + # Positive note β€” goes to `notes`, NOT `issues`. Otherwise verified + # entries would display a misleading "1 issue(s)" badge. + primary.notes.append( + f"Corroborated by {agree_count} other source(s): {', '.join(sorted(distinct_sources))}." + ) + + # Year-only mismatch with otherwise solid match: drop the hard issue + # and record a soft note instead (preprint/published year difference). + if (primary.title_match and primary.author_match and not primary.year_match + and year_ok_loose and primary.bib_year and primary.fetched_year): + primary.issues = [ + i for i in primary.issues if not i.startswith("Year mismatch") + ] + primary.notes.append( + f"Year differs by ≀1 ({primary.bib_year} vs {primary.fetched_year}) β€” " + "likely preprint/published difference, treated as match." + ) + + return primary diff --git a/bibguard.yaml b/bibguard.yaml index 7dc574f5a798d32b96cd2b2e5a307e0473318e97..633591e0e4ea81c000a181cb77e52f5f10fe0bc5 100644 --- a/bibguard.yaml +++ b/bibguard.yaml @@ -27,6 +27,23 @@ files: output_dir: "test" +# ============================================================================== +# 🌐 Network / Politeness +# ============================================================================== +network: + # Real email used in User-Agent for arXiv/CrossRef/OpenAlex polite-pool requests. + # arXiv's robots policy asks for a real contact. Strongly recommended to fill in. + contact_email: "" + + # Cache HTTP responses to a local SQLite DB. Same `entry.key` won't re-hit network + # within the TTL window. Hugely speeds up re-runs. + cache_enabled: true + cache_ttl_hours: 24 + + # Auto-retry on 429/5xx with exponential backoff. 
+ retry_total: 5 + retry_backoff_factor: 1.5 + # ============================================================================== # 🎓 Conference Template # ============================================================================== @@ -59,7 +76,7 @@ bibliography: # Relevance Assessment - Use LLM to evaluate if citations match their context # Requires LLM configuration (see llm section below). Disabled by default due to API costs. - check_relevance: false + check_relevance: true # ============================================================================== # 📋 Submission Quality Checks # ============================================================================== @@ -125,6 +142,21 @@ submission: # Detects GitHub links, acknowledgments, self-citations that may reveal author identity anonymization: true +# ============================================================================== +# 🌐 Network-Bound Bibliography Checks +# ============================================================================== +# These run only when explicitly enabled. Both operate solely on bib entries +# that carry the relevant field (no DOI ⇒ retraction skipped, no url= ⇒ +# liveness skipped). The web UI's "Strict" preset turns both on. +submission_extra: + # URL Liveness - HEAD-then-GET every entry.url to find dead links. + # Slow on large bibs (one HTTP roundtrip per URL); off by default. + url_liveness: false + + # Retractions - Look up every entry.doi against CrossRef's update-to relation + # to flag retracted, withdrawn, or "expression of concern" papers. + retraction: true + # ============================================================================== # 🔍 Metadata Check Workflow # ============================================================================== @@ -133,7 +165,7 @@ submission: # Set enabled: false to skip a particular source.
workflow: - name: arxiv_id - enabled: true + enabled: false description: "Lookup by arXiv ID (fastest, most reliable for preprints)" - name: crossref_doi @@ -153,7 +185,7 @@ workflow: description: "OpenAlex API (broad coverage across disciplines)" - name: arxiv_title - enabled: true + enabled: false description: "Search arXiv by title (fallback when ID unavailable)" - name: crossref_title @@ -171,17 +203,18 @@ llm: # Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek # Each backend requires different setup (API keys, local installation, etc.) backend: "gemini" - + # Model name (leave empty to use backend default) - # Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3" + # Examples: "gpt-4o-mini", "claude-haiku-4-5-20251001", "gemini-2.5-flash", "llama3" model: "" # API endpoint (leave empty to use backend default) # Only needed for self-hosted models (vllm, ollama) or custom endpoints endpoint: "" - # API key (recommended to use environment variables instead) - # Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment + # API key (RECOMMENDED: leave empty and use environment variables instead) + # Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, DEEPSEEK_API_KEY, etc. + # in your shell. BibGuard will read from $<BACKEND>_API_KEY automatically.
api_key: "" # ============================================================================== diff --git a/main.py b/main.py index f77d35e1cd9fd687e074ffd493835a0c80fd1e36..c0b11ab97d0d6c7c383069812bb998976ad0bd93 100644 --- a/main.py +++ b/main.py @@ -7,8 +7,12 @@ Usage: python main.py --config my.yaml # Use specified config file python main.py --init # Create default config file python main.py --list-templates # List available templates + python main.py --quick # Skip network-bound metadata/relevance/url checks + python main.py --format json,html,markdown + python main.py --verbose # DEBUG-level logs to stderr """ import argparse +import logging import sys from pathlib import Path from typing import Optional, List @@ -19,10 +23,17 @@ from src.analyzers import MetadataComparator, UsageChecker, LLMEvaluator, Duplic from src.analyzers.llm_evaluator import LLMBackend from src.report.generator import ReportGenerator, EntryReport from src.utils.progress import ProgressDisplay +from src.utils.logging_setup import setup as setup_logging +from src.utils import http as http_layer +from src.utils.validation import validate_bib, validate_tex, format_report from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow from src.templates.base_template import get_template, get_all_templates from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity +from src.checkers.retraction_checker import RetractionChecker +from src.checkers.url_checker import URLChecker + +logger = logging.getLogger("bibguard") def main(): @@ -52,8 +63,24 @@ Usage Examples: action="store_true", help="List all available conference templates" ) - + parser.add_argument( + "--quick", + action="store_true", + help="Skip network-bound checks (metadata, retraction, URL liveness, LLM)", + ) + parser.add_argument( + "--format", + default=None, + help="Comma-separated list 
of output formats (markdown, html, json). Defaults to config.", + ) + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Verbose (DEBUG) logging to stderr", + ) + args = parser.parse_args() + setup_logging("DEBUG" if args.verbose else None) # Handle --init if args.init: @@ -95,25 +122,43 @@ Usage Examples: print(f"Error: Failed to parse config file: {e}") sys.exit(1) + # CLI overrides + if args.quick: + config.bibliography.check_metadata = False + config.bibliography.check_relevance = False + config.submission_extra.url_liveness = False + config.submission_extra.retraction = False + if args.format: + config.output.formats = [s.strip() for s in args.format.split(",") if s.strip()] + + # Configure shared HTTP layer (retry + cache + UA) + http_layer.configure( + contact_email=config.network.contact_email, + cache_enabled=config.network.cache_enabled, + cache_ttl_hours=config.network.cache_ttl_hours, + retry_total=config.network.retry_total, + retry_backoff_factor=config.network.retry_backoff_factor, + ) + # Validate required fields mode_dir = bool(config.files.input_dir) - + if mode_dir: input_dir = config.input_dir_path if not input_dir.exists() or not input_dir.is_dir(): print(f"Error: Input directory does not exist or is not a directory: {input_dir}") sys.exit(1) - + tex_files = list(input_dir.rglob("*.tex")) bib_files = list(input_dir.rglob("*.bib")) - + if not tex_files: print(f"Error: No .tex files found in {input_dir}") sys.exit(1) if not bib_files: print(f"Error: No .bib files found in {input_dir}") sys.exit(1) - + config._tex_files = tex_files config._bib_files = bib_files else: @@ -123,7 +168,7 @@ Usage Examples: if not config.files.tex: print("Error: tex file path not specified in config") sys.exit(1) - + # Validate files exist if not config.bib_path.exists(): print(f"Error: Bib file does not exist: {config.bib_path}") @@ -131,10 +176,29 @@ Usage Examples: if not config.tex_path.exists(): print(f"Error: TeX file does not exist: 
{config.tex_path}") sys.exit(1) - + config._tex_files = [config.tex_path] config._bib_files = [config.bib_path] - + + # Pre-flight content validation (R6) + any_fatal = False + for bp in config._bib_files: + rep = validate_bib(bp) + msg = format_report(rep, label=bp.name) + if msg: + print(msg) + if not rep.ok: + any_fatal = True + for tp in config._tex_files: + rep = validate_tex(tp) + msg = format_report(rep, label=tp.name) + if msg: + print(msg) + if not rep.ok: + any_fatal = True + if any_fatal: + sys.exit(1) + # Load template if specified template = None if config.template: @@ -143,12 +207,12 @@ Usage Examples: print(f"Error: Unknown template: {config.template}") print("Use --list-templates to see available templates") sys.exit(1) - + # Run the checker try: run_checker(config, template) except KeyboardInterrupt: - print("\n\nCancelled") + print("\n\n[BibGuard] Interrupted. Partial reports (if any) are in the output dir.") sys.exit(130) except Exception as e: print(f"\nError: {e}") @@ -250,32 +314,62 @@ def run_checker(config: BibGuardConfig, template=None): [str(f) for f in config._tex_files] ) + # Build the per-checker config dict (glossary, template, etc.) + checker_config = { + "glossary_preferred": config.glossary.preferred, + "glossary_acronyms": config.glossary.acronyms, + "template": template, + } + # Run submission quality checks submission_results = [] - enabled_checkers = config.submission.get_enabled_checkers() - + enabled_checkers = list(config.submission.get_enabled_checkers()) + if template is not None and "template" not in enabled_checkers: + enabled_checkers.append("template") + for checker_name in enabled_checkers: if checker_name in CHECKER_REGISTRY: checker = CHECKER_REGISTRY[checker_name]() for tex_path_str, content in tex_contents.items(): - results = checker.check(content, {}) - # Tag results with file path - for r in results: - r.file_path = tex_path_str + # Run the checker on this file. 
We deliberately do NOT tag + # `r.file_path = tex_path_str` because user-facing reports + # never expose local tex paths (basename or full). + results = checker.check(content, checker_config) submission_results.extend(results) - + # Set results in report generator for summary calculation report_gen.set_submission_results(submission_results, template) - + # Check for duplicates (silent) if bib_config.check_duplicates and duplicate_detector: duplicate_groups = duplicate_detector.find_duplicates(entries) report_gen.set_duplicate_groups(duplicate_groups) - + # Check missing citations (silent) if bib_config.check_usage and usage_checker: missing = usage_checker.get_missing_entries(entries) report_gen.set_missing_citations(missing) + + # Retraction lookups (F1) + if config.submission_extra.retraction: + try: + findings = RetractionChecker().check_entries(entries) + report_gen.set_retraction_findings(findings) + if findings: + logger.info("Retraction check found %d flagged entries", len(findings)) + except Exception as e: + logger.debug("Retraction check failed: %s", e) + + # URL liveness (F2) + if config.submission_extra.url_liveness: + try: + url_findings = URLChecker().check_entries(entries) + report_gen.set_url_findings(url_findings) + broken = sum(1 for f in url_findings if f.status != "ok") + if broken: + logger.info("URL liveness check: %d broken URL(s)", broken) + except Exception as e: + logger.debug("URL liveness check failed: %s", e) # Process entries @@ -347,41 +441,46 @@ def run_checker(config: BibGuardConfig, template=None): # Determine number of workers (max 10 to avoid overwhelming APIs) max_workers = min(10, len(entries)) + interrupted = False with progress.progress_context(len(entries), "Processing bibliography") as prog: # Use ThreadPoolExecutor for parallel processing with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all tasks future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries} - + # Process 
completed tasks - for future in as_completed(future_to_entry): - entry = future_to_entry[future] - try: - entry_report, comparison_result = future.result() - - # Thread-safe progress update - with progress_lock: - report_gen.add_entry_report(entry_report) - - # Update progress - if comparison_result and comparison_result.is_match: - prog.mark_success() - elif comparison_result and comparison_result.has_issues: - prog.mark_warning() - else: + try: + for future in as_completed(future_to_entry): + entry = future_to_entry[future] + try: + entry_report, comparison_result = future.result() + + # Thread-safe progress update + with progress_lock: + report_gen.add_entry_report(entry_report) + + # Update progress + if comparison_result and comparison_result.is_match: + prog.mark_success() + elif comparison_result and comparison_result.has_issues: + prog.mark_warning() + else: + prog.mark_error() + + completed_count[0] += 1 + prog.update(entry.key, "Done", 1) + + except Exception as e: + with progress_lock: prog.mark_error() - - completed_count[0] += 1 - prog.update(entry.key, "Done", 1) - - except Exception as e: - with progress_lock: - prog.mark_error() - progress.print_error(f"Error processing {entry.key}: {e}") - completed_count[0] += 1 - prog.update(entry.key, "Failed", 1) - - # Summary will be printed at the very end + progress.print_error(f"Error processing {entry.key}: {e}") + completed_count[0] += 1 + prog.update(entry.key, "Failed", 1) + except KeyboardInterrupt: + interrupted = True + logger.warning("Interrupted by user; cancelling remaining work and saving partial reports") + for f in future_to_entry: + f.cancel() # Generate reports and organize outputs (silent) @@ -395,61 +494,55 @@ def run_checker(config: BibGuardConfig, template=None): shutil.copy2(bib_path, output_dir / bib_path.name) for tex_path in config._tex_files: shutil.copy2(tex_path, output_dir / tex_path.name) - # 1. 
Bibliography Report - bib_report_path = output_dir / "bibliography_report.md" - report_gen.save_bibliography_report(str(bib_report_path)) - - # 2. LaTeX Quality Report - if submission_results: - latex_report_path = output_dir / "latex_quality_report.md" - report_gen.save_latex_quality_report( - str(latex_report_path), - submission_results, - template - ) - - # 3. Line-by-Line Report - from src.report.line_report import generate_line_report - line_report_path = output_dir / "line_by_line_report.md" - - # For multiple files, we generate one big report with sections - all_line_reports = [] - for tex_path_str, content in tex_contents.items(): - file_results = [r for r in submission_results if r.file_path == tex_path_str] - if not file_results: - continue - - from src.report.line_report import LineByLineReportGenerator - gen = LineByLineReportGenerator(content, tex_path_str) - gen.add_results(file_results) - all_line_reports.append(gen.generate()) - - if all_line_reports: - with open(line_report_path, 'w', encoding='utf-8') as f: - f.write("\n\n".join(all_line_reports)) - - # 4. Clean bib file (if generated earlier) + requested_formats = {f.lower() for f in (config.output.formats or ["markdown", "html"])} + + # 1. Bibliography Report (markdown) + if "markdown" in requested_formats: + bib_report_path = output_dir / "bibliography_report.md" + report_gen.save_bibliography_report(str(bib_report_path)) + + # 2. LaTeX Quality Report (markdown) + if submission_results: + latex_report_path = output_dir / "latex_quality_report.md" + report_gen.save_latex_quality_report( + str(latex_report_path), + submission_results, + template, + ) + + # 4. Self-contained HTML (β˜…) + if "html" in requested_formats: + try: + report_gen.save_html(str(output_dir / "report.html")) + except Exception as e: + logger.warning("Failed to write HTML report: %s", e) + + # 5. 
JSON output + if "json" in requested_formats: + try: + report_gen.save_json(str(output_dir / "report.json")) + except Exception as e: + logger.warning("Failed to write JSON report: %s", e) + + # 6. Clean bib file (if generated earlier) if bib_config.check_usage and usage_checker: used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used] if used_entries: try: keys_to_keep = {entry.key for entry in used_entries} - # If multiple bibs, we merge them into one cleaned file - # or just use the first one if it's single mode. - # For now, let's just use a default name if multiple. if len(config._bib_files) == 1: clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib" bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep) else: clean_bib_path = output_dir / "merged_only_used.bib" - # We need a way to filter multiple files into one. - # BibParser.filter_file currently takes one input. - # Let's just write all used entries to a new file. 
with open(clean_bib_path, 'w', encoding='utf-8') as f: for entry in used_entries: - f.write(entry.raw + "\n\n") + f.write(getattr(entry, "raw", "") + "\n\n") except Exception as e: - pass + logger.debug("Failed to write cleaned bib file: %s", e) + + if interrupted: + print("[BibGuard] Saved partial reports for completed entries.") # Print beautiful console summary if not config.output.quiet: @@ -461,85 +554,40 @@ def fetch_and_compare_with_workflow( entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher, semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator ): - """Fetch metadata from online sources using the configured workflow.""" - from src.utils.normalizer import TextNormalizer - - all_results = [] - enabled_steps = workflow_config.get_enabled_steps() - - for step in enabled_steps: - result = None - - if step.name == "arxiv_id" and entry.has_arxiv and arxiv_fetcher: - arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id) - if arxiv_meta: - result = comparator.compare_with_arxiv(entry, arxiv_meta) - - elif step.name == "crossref_doi" and entry.doi and crossref_fetcher: - crossref_result = crossref_fetcher.search_by_doi(entry.doi) - if crossref_result: - result = comparator.compare_with_crossref(entry, crossref_result) - - elif step.name == "semantic_scholar" and entry.title and semantic_scholar_fetcher: - ss_result = None - if entry.doi: - ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi) - if not ss_result: - ss_result = semantic_scholar_fetcher.search_by_title(entry.title) - if ss_result: - result = comparator.compare_with_semantic_scholar(entry, ss_result) - - elif step.name == "dblp" and entry.title and dblp_fetcher: - dblp_result = dblp_fetcher.search_by_title(entry.title) - if dblp_result: - result = comparator.compare_with_dblp(entry, dblp_result) - - elif step.name == "openalex" and entry.title and openalex_fetcher: - oa_result = None - if entry.doi: - oa_result = openalex_fetcher.fetch_by_doi(entry.doi) - if 
not oa_result: - oa_result = openalex_fetcher.search_by_title(entry.title) - if oa_result: - result = comparator.compare_with_openalex(entry, oa_result) - - elif step.name == "arxiv_title" and entry.title and arxiv_fetcher: - results = arxiv_fetcher.search_by_title(entry.title, max_results=3) - if results: - best_result = None - best_sim = 0.0 - norm1 = TextNormalizer.normalize_for_comparison(entry.title) - - for r in results: - norm2 = TextNormalizer.normalize_for_comparison(r.title) - sim = TextNormalizer.similarity_ratio(norm1, norm2) - if sim > best_sim: - best_sim = sim - best_result = r - - if best_result and best_sim > 0.5: - result = comparator.compare_with_arxiv(entry, best_result) - - elif step.name == "crossref_title" and entry.title and crossref_fetcher: - crossref_result = crossref_fetcher.search_by_title(entry.title) - if crossref_result: - result = comparator.compare_with_crossref(entry, crossref_result) - - elif step.name == "google_scholar" and entry.title and scholar_fetcher: + """ + Fetch metadata across all configured sources and pick the best match. + + Delegates the heavy lifting to ``app_helper.fetch_and_compare_with_workflow``, + which runs identifier-based and title-based lookups in parallel and uses + cross-source corroboration to decide is_match. Google Scholar is consulted + only as a last-resort fallback because scraping is fragile and frequently + blocked. + """ + from app_helper import fetch_and_compare_with_workflow as _parallel_lookup + + primary = _parallel_lookup( + entry, workflow_config, arxiv_fetcher, crossref_fetcher, + semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator, + ) + + if primary and primary.source != "unable": + return primary + + # Last-resort Google Scholar fallback (web scraping; frequently blocked). 
+ if entry.title and scholar_fetcher: + try: scholar_result = scholar_fetcher.search_by_title(entry.title) if scholar_result: - result = comparator.compare_with_scholar(entry, scholar_result) - - if result: - all_results.append(result) - if result.is_match: - return result - - if all_results: - all_results.sort(key=lambda r: r.confidence, reverse=True) - return all_results[0] - - return comparator.create_unable_result(entry, "Unable to find this paper in any data source") + return comparator.compare_with_scholar(entry, scholar_result) + except Exception as e: + logger.warning( + "Google Scholar fallback failed for entry=%s: %s", + getattr(entry, "key", ""), e, exc_info=True, + ) + + return primary or comparator.create_unable_result( + entry, "Unable to find this paper in any data source" + ) def get_abstract(entry, comparison_result, arxiv_fetcher): diff --git a/requirements.txt b/requirements.txt index 1a3952441f19a7f9d91a2dd8d613fe38858cc019..7d8d60d85e14b204940e514ae605b1aeab53676f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ gradio>=6.0.0 bibtexparser>=1.4.0 requests>=2.31.0 +requests-cache>=1.2.0 beautifulsoup4>=4.12.0 rich>=13.7.0 Unidecode>=1.3.0 diff --git a/scripts/install-hook.sh b/scripts/install-hook.sh new file mode 100755 index 0000000000000000000000000000000000000000..166db91128edf60e78d872eb6c3e5e663221d27b --- /dev/null +++ b/scripts/install-hook.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Install a git pre-commit hook that runs BibGuard in --quick mode whenever +# the staged changes touch .bib or .tex files. +# +# Usage (run from the repo root that contains your paper, NOT BibGuard's repo): +# bash /path/to/BibGuard/scripts/install-hook.sh +# +# Skip the hook for one commit: git commit --no-verify +set -euo pipefail + +if ! git rev-parse --git-dir >/dev/null 2>&1; then + echo "Error: not inside a git repo." 
>&2 + exit 1 +fi + +HOOK_DIR="$(git rev-parse --git-dir)/hooks" +HOOK="$HOOK_DIR/pre-commit" + +# Locate BibGuard's main.py β€” we assume this script lives in BibGuard/scripts/. +BIBGUARD_DIR="$(cd "$(dirname "$0")/.." && pwd)" +MAIN_PY="$BIBGUARD_DIR/main.py" +if [[ ! -f "$MAIN_PY" ]]; then + echo "Error: cannot locate BibGuard main.py at $MAIN_PY" >&2 + exit 1 +fi + +mkdir -p "$HOOK_DIR" + +if [[ -f "$HOOK" ]]; then + echo "A pre-commit hook already exists at $HOOK" + echo "Backing it up to $HOOK.bibguard-backup" + mv "$HOOK" "$HOOK.bibguard-backup" +fi + +cat >"$HOOK" < Optional[Tuple[LLMBackend, str]]: + """ + Find the first backend that has credentials in the environment. + + Returns (backend, api_key) or None. For Ollama we attempt a localhost + probe so users with `ollama serve` running get auto-selected with no + config. + """ + for backend in _AUTODETECT_ORDER: + env = _BACKEND_ENV.get(backend, "") + if env: + key = os.environ.get(env, "").strip() + if key: + return backend, key + elif backend == LLMBackend.OLLAMA: + # Local probe β€” small timeout so absence isn't painful. 
+ try: + r = requests.get("http://localhost:11434/api/tags", timeout=1.0) + if r.status_code == 200: + return backend, "" + except requests.RequestException: + continue + return None + + @dataclass class EvaluationResult: """Result of LLM citation evaluation.""" @@ -30,15 +80,16 @@ class EvaluationResult: explanation: str context_used: str abstract_used: str + citation_role: str = "" # baseline | method | dataset | counterexample | survey | motivation | other line_number: Optional[int] = None file_path: Optional[str] = None error: Optional[str] = None - + @property def score_label(self) -> str: labels = { 1: "Not Relevant", - 2: "Marginally Relevant", + 2: "Marginally Relevant", 3: "Somewhat Relevant", 4: "Relevant", 5: "Highly Relevant" @@ -49,7 +100,7 @@ class EvaluationResult: class LLMEvaluator: """Evaluates citation relevance using LLM.""" - PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant. + PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant, and identify the citation's role in the manuscript. ## Citation Context (from the manuscript): {context} @@ -62,23 +113,28 @@ Evaluate the relevance and appropriateness of this citation. Consider: 1. Does the citation support the claim being made in the context? 2. Is the cited paper's topic related to the discussion? 3. Is this citation necessary, or could it be replaced with a more relevant one? +4. What is the *role* of this citation in the manuscript? + +## Citation roles (pick exactly one): +- "baseline": cited paper is used/compared as a baseline or prior method. +- "method": cited paper introduces a method that the manuscript builds on or uses directly. +- "dataset": cited paper provides a dataset/benchmark the manuscript uses. 
+- "counterexample": cited to show a contrary finding or argue against. +- "survey": cited as a survey/overview reference. +- "motivation": cited to motivate the problem (background, application, statistics). +- "other": none of the above clearly applies. ## Response Format: -Provide your response in the following JSON format: +Respond with ONE JSON object, no other text: {{ - "relevance_score": <1-5 integer>, - "is_relevant": , - "explanation": "" + "relevance_score": , + "is_relevant": , + "citation_role": "", + "explanation": "<1-2 sentences>" }} -Score guide: -- 1: Not relevant at all -- 2: Marginally relevant -- 3: Somewhat relevant -- 4: Relevant and appropriate -- 5: Highly relevant and essential - -STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other text.""" +Score guide: 1=Not relevant, 2=Marginally, 3=Somewhat, 4=Relevant, 5=Highly relevant. +STRICTLY FOLLOW THE JSON FORMAT.""" def __init__( self, @@ -90,28 +146,32 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex self.backend = backend self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY") - # Set defaults based on backend + # Set defaults based on backend (cheap, fast models that exist) if backend == LLMBackend.OPENAI: self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions" - self.model = model or "gpt-5-mini" + self.model = model or "gpt-4o-mini" elif backend == LLMBackend.ANTHROPIC: self.endpoint = endpoint or "https://api.anthropic.com/v1/messages" - self.model = model or "claude-4.5-haiku" + self.model = model or "claude-haiku-4-5-20251001" elif backend == LLMBackend.DEEPSEEK: self.endpoint = endpoint or "https://api.deepseek.com/chat/completions" self.model = model or "deepseek-chat" elif backend == LLMBackend.OLLAMA: self.endpoint = endpoint or "http://localhost:11434/api/generate" - self.model = model or "Qwen/qwen3-4B-Instruct-2507" + self.model = model or "qwen2.5:3b-instruct" elif backend == 
LLMBackend.VLLM: self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions" - self.model = model or "Qwen/qwen3-4B-Instruct-2507" + self.model = model or "Qwen/Qwen2.5-3B-Instruct" elif backend == LLMBackend.GEMINI: self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models" - self.model = model or "gemini-2.5-flash-lite" + self.model = model or "gemini-2.5-flash" + # Retry config for transient LLM failures (rate limits, server errors, JSON issues). + MAX_ATTEMPTS = 3 + RETRY_BASE_DELAY = 1.5 # seconds, exponential + def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult: - """Evaluate citation relevance.""" + """Evaluate citation relevance with retries on transient errors.""" if not context or not abstract: return EvaluationResult( entry_key=entry_key, @@ -122,34 +182,51 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex abstract_used=abstract, error="Missing context or abstract for evaluation" ) - - # Don't truncate - preserve full context and abstract + prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract) - - try: - if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM): - response = self._call_openai_compatible(prompt) - elif self.backend == LLMBackend.ANTHROPIC: - response = self._call_anthropic(prompt) - elif self.backend == LLMBackend.OLLAMA: - response = self._call_ollama(prompt) - elif self.backend == LLMBackend.GEMINI: - response = self._call_gemini(prompt) - else: - raise ValueError(f"Unknown backend: {self.backend}") - - return self._parse_response(entry_key, response, context, abstract) - - except Exception as e: - return EvaluationResult( - entry_key=entry_key, - relevance_score=0, - is_relevant=False, - explanation="", - context_used=context, - abstract_used=abstract, - error=str(e) - ) + + last_err: Optional[str] = None + for attempt in range(1, self.MAX_ATTEMPTS + 1): + try: + if self.backend in 
(LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM): + response = self._call_openai_compatible(prompt) + elif self.backend == LLMBackend.ANTHROPIC: + response = self._call_anthropic(prompt) + elif self.backend == LLMBackend.OLLAMA: + response = self._call_ollama(prompt) + elif self.backend == LLMBackend.GEMINI: + response = self._call_gemini(prompt) + else: + raise ValueError(f"Unknown backend: {self.backend}") + + parsed = self._parse_response(entry_key, response, context, abstract) + # Successful structured parse β†’ return. + if parsed.error is None: + return parsed + # JSON parse failed β€” retry with the same prompt; LLM jitter + # often resolves on a second pass. + last_err = parsed.error + except requests.exceptions.RequestException as e: + last_err = f"network: {e}" + # Transient: retry with backoff. + except Exception as e: + last_err = str(e) + + if attempt < self.MAX_ATTEMPTS: + delay = self.RETRY_BASE_DELAY * (2 ** (attempt - 1)) + logger.debug("LLM attempt %d/%d failed (%s); retrying in %.1fs", + attempt, self.MAX_ATTEMPTS, last_err, delay) + time.sleep(delay) + + return EvaluationResult( + entry_key=entry_key, + relevance_score=0, + is_relevant=False, + explanation="", + context_used=context, + abstract_used=abstract, + error=last_err or "Unknown error after retries" + ) def _call_openai_compatible(self, prompt: str) -> str: """Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM).""" @@ -272,24 +349,77 @@ STRICTLY FOLLOW THE JSON FORMAT. 
Respond ONLY with the JSON object, no other tex return parts[0].get("text", "") return "" - def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult: - """Parse LLM response.""" - # Try to extract JSON from response - json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL) - - data = {} - if not json_match: - # Try to parse the whole response as JSON - try: - data = json.loads(response.strip()) - except json.JSONDecodeError: - pass - else: + @staticmethod + def _extract_json_object(text: str) -> Optional[dict]: + """ + Robust JSON extraction. Handles: + - bare JSON + - fenced ```json ... ``` blocks + - JSON embedded in surrounding prose + - nested objects (the simple `\\{[^{}]*\\}` regex misses these) + """ + if not text: + return None + s = text.strip() + + # Direct parse + try: + obj = json.loads(s) + if isinstance(obj, dict): + return obj + except json.JSONDecodeError: + pass + + # Strip Markdown code fences (```json ... ``` or ``` ... ```) + fence_match = re.search(r"```(?:json)?\s*(.*?)```", s, re.DOTALL | re.IGNORECASE) + if fence_match: + inner = fence_match.group(1).strip() try: - data = json.loads(json_match.group()) + obj = json.loads(inner) + if isinstance(obj, dict): + return obj except json.JSONDecodeError: pass - + s = inner # fall through to brace-balance scan on inner + + # Brace-balanced scan: find the first complete top-level {...}. 
+ start = s.find("{") + while start != -1: + depth = 0 + in_str = False + esc = False + for i in range(start, len(s)): + ch = s[i] + if esc: + esc = False + continue + if ch == "\\": + esc = True + continue + if ch == '"': + in_str = not in_str + continue + if in_str: + continue + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + chunk = s[start:i + 1] + try: + obj = json.loads(chunk) + if isinstance(obj, dict): + return obj + except json.JSONDecodeError: + break + start = s.find("{", start + 1) + return None + + def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult: + """Parse LLM response with robust JSON extraction.""" + data = self._extract_json_object(response) or {} + if not data: return EvaluationResult( entry_key=entry_key, @@ -301,27 +431,44 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex error="Failed to parse LLM response as JSON" ) - # Extract fields - relevance_score = data.get("relevance_score", 0) - if isinstance(relevance_score, str): - try: - relevance_score = int(relevance_score) - except ValueError: - relevance_score = 0 - - is_relevant = data.get("is_relevant", False) + # Extract & validate fields + raw_score = data.get("relevance_score", data.get("score", 0)) + try: + relevance_score = int(float(raw_score)) + except (TypeError, ValueError): + relevance_score = 0 + relevance_score = max(0, min(5, relevance_score)) + + is_relevant = data.get("is_relevant", relevance_score >= 4) if isinstance(is_relevant, str): - is_relevant = is_relevant.lower() in ("true", "yes", "1") - - explanation = data.get("explanation", "") - + is_relevant = is_relevant.strip().lower() in ("true", "yes", "1", "y") + + explanation = str(data.get("explanation", data.get("reason", ""))).strip() + citation_role = str(data.get("citation_role", data.get("role", ""))).strip().lower() or "other" + if citation_role not in {"baseline", "method", "dataset", 
"counterexample", "survey", "motivation", "other"}: + citation_role = "other" + + # Sanity: a score of 0 means the LLM didn't actually return one β€” flag it. + if relevance_score == 0: + return EvaluationResult( + entry_key=entry_key, + relevance_score=0, + is_relevant=False, + explanation=explanation or response, + context_used=context, + abstract_used=abstract, + citation_role=citation_role, + error="LLM did not return a usable relevance_score", + ) + return EvaluationResult( entry_key=entry_key, relevance_score=relevance_score, is_relevant=is_relevant, explanation=explanation, context_used=context, - abstract_used=abstract + abstract_used=abstract, + citation_role=citation_role, ) def test_connection(self) -> bool: @@ -371,6 +518,7 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex } response = requests.post(url, json=payload, timeout=10) return response.status_code == 200 - except Exception: + except Exception as e: + logger.debug("LLM test_connection failed for %s: %s", self.backend.value, e) return False return False diff --git a/src/analyzers/metadata_comparator.py b/src/analyzers/metadata_comparator.py index 387a032655ff16bbdd2ce3dc5942d4b2cc6b836b..2108add7b4eaefc3e184289308325d12f48f0c2e 100644 --- a/src/analyzers/metadata_comparator.py +++ b/src/analyzers/metadata_comparator.py @@ -18,30 +18,41 @@ from ..utils.normalizer import TextNormalizer class ComparisonResult: """Result of comparing bib entry with fetched metadata.""" entry_key: str - + # Title comparison title_match: bool title_similarity: float bib_title: str fetched_title: str - + # Author comparison author_match: bool author_similarity: float bib_authors: list[str] fetched_authors: list[str] - + # Year comparison year_match: bool bib_year: str fetched_year: str - + # Overall assessment is_match: bool confidence: float issues: list[str] source: str # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable' - + + # F4: When an arXiv 
preprint has a published counterpart, surface it here. + published_version_hint: str = "" # e.g. "Also published at NeurIPS 2024 (doi:10.1145/...)" + + # Positive / informational notes that should NOT be counted as issues + # (e.g. "corroborated by S2", "year differs by ≀1, treated as match"). + notes: list[str] = None # type: ignore[assignment] + + def __post_init__(self): + if self.notes is None: + self.notes = [] + @property def has_issues(self) -> bool: return len(self.issues) > 0 @@ -60,7 +71,17 @@ class MetadataComparator: def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult: """Compare bib entry with arXiv metadata.""" issues = [] - + + # F4: Extract a published-version hint if arXiv records it. + published_hint = "" + if arxiv_meta.journal_ref or arxiv_meta.doi: + parts = [] + if arxiv_meta.journal_ref: + parts.append(arxiv_meta.journal_ref.strip()) + if arxiv_meta.doi: + parts.append(f"doi:{arxiv_meta.doi.strip()}") + published_hint = "Has a published version β€” " + " | ".join(parts) + # Compare titles bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title) arxiv_title_norm = self.normalizer.normalize_for_comparison(arxiv_meta.title) @@ -114,7 +135,8 @@ class MetadataComparator: is_match=is_match, confidence=confidence, issues=issues, - source="arxiv" + source="arxiv", + published_version_hint=published_hint, ) def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult: diff --git a/src/checkers/__init__.py b/src/checkers/__init__.py index 8dfa6f6c745d324ef8d6b1a4909050ae5fd07987..323b4eff2dc0c75f2381fc871713a8f3b8472bdd 100644 --- a/src/checkers/__init__.py +++ b/src/checkers/__init__.py @@ -11,6 +11,7 @@ from .consistency_checker import ConsistencyChecker from .citation_quality_checker import CitationQualityChecker from .equation_checker import EquationChecker from .acronym_checker import AcronymChecker +from .template_checker import 
TemplateChecker __all__ = [ 'BaseChecker', @@ -27,6 +28,7 @@ __all__ = [ 'CitationQualityChecker', 'EquationChecker', 'AcronymChecker', + 'TemplateChecker', ] @@ -43,6 +45,7 @@ CHECKER_REGISTRY = { 'citation_quality': CitationQualityChecker, 'equation': EquationChecker, 'acronym': AcronymChecker, + 'template': TemplateChecker, } diff --git a/src/checkers/__pycache__/__init__.cpython-313.pyc b/src/checkers/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index dfbadc876a0bde939483c7b0f82bf06d24750cbd..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/acronym_checker.cpython-313.pyc b/src/checkers/__pycache__/acronym_checker.cpython-313.pyc deleted file mode 100644 index 2ac22b6f83d4ca40144e2133b2c42c6080d78275..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/acronym_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc b/src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc deleted file mode 100644 index b174f02eae34f07ad56e5e64a30638db94aa0a39..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc b/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc deleted file mode 100644 index 235ac14399e902e0cd1888e78a3ec1aca756822b..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/anonymization_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/base.cpython-313.pyc b/src/checkers/__pycache__/base.cpython-313.pyc deleted file mode 100644 index 5f8a48378cd6d7d880c2ea0c1635ab2d266402df..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/base.cpython-313.pyc and /dev/null differ diff 
--git a/src/checkers/__pycache__/caption_checker.cpython-313.pyc b/src/checkers/__pycache__/caption_checker.cpython-313.pyc deleted file mode 100644 index ded8600e65ad18259c7d969ba6d3a6a88b64c677..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/caption_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc b/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc deleted file mode 100644 index 0403cd564b3cb9927f6b0f05c6c51afb8e213a54..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/consistency_checker.cpython-313.pyc b/src/checkers/__pycache__/consistency_checker.cpython-313.pyc deleted file mode 100644 index 3c64657741db59e4627caae647a1ff9e8a725492..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/consistency_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/equation_checker.cpython-313.pyc b/src/checkers/__pycache__/equation_checker.cpython-313.pyc deleted file mode 100644 index d598e60c925c8e5d0c4505d3f97909dbe9e4e818..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/equation_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/formatting_checker.cpython-313.pyc b/src/checkers/__pycache__/formatting_checker.cpython-313.pyc deleted file mode 100644 index b2aaa1357a5a6800b184ae2f5b6dcae7c2df51cb..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/formatting_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/number_checker.cpython-313.pyc b/src/checkers/__pycache__/number_checker.cpython-313.pyc deleted file mode 100644 index af09ad160e7cf0d63d4cd3685ca98973e9175fa0..0000000000000000000000000000000000000000 Binary files 
a/src/checkers/__pycache__/number_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/reference_checker.cpython-313.pyc b/src/checkers/__pycache__/reference_checker.cpython-313.pyc deleted file mode 100644 index c02ac2fa5d31c58c4e9909fecfb0616cd0fca9d9..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/reference_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/__pycache__/sentence_checker.cpython-313.pyc b/src/checkers/__pycache__/sentence_checker.cpython-313.pyc deleted file mode 100644 index 00cf2c808194adcccaa032ee526542c5715e25ca..0000000000000000000000000000000000000000 Binary files a/src/checkers/__pycache__/sentence_checker.cpython-313.pyc and /dev/null differ diff --git a/src/checkers/acronym_checker.py b/src/checkers/acronym_checker.py index b88ea59ac78f44361d6abcb4d626d08d89d105f2..263e56acc8697e7b29a6fdc199a64e221c5362f5 100644 --- a/src/checkers/acronym_checker.py +++ b/src/checkers/acronym_checker.py @@ -87,23 +87,30 @@ class AcronymChecker(BaseChecker): } def check(self, tex_content: str, config: dict = None) -> List[CheckResult]: + config = config or {} results = [] - + + # Project glossary: skip-set + auto-defined map + user_acronyms = dict(config.get('glossary_acronyms', {}) or {}) + # All user-supplied acronyms are considered "known/defined" β€” never warn about them. 
+ glossary_skip = {k.upper() for k in user_acronyms.keys()} + common_plus_glossary = self.COMMON_ACRONYMS | glossary_skip + # Remove comments using base class method content = self._remove_comments(tex_content) - + # Find all defined acronyms with their positions defined_acronyms = self._find_definitions(content) - + # Find all acronym usages (excluding special contexts) all_usages = self._find_all_usages(content) - + # NEW: Find potential full forms for each acronym acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys()) - + # Check for undefined acronyms (only those with matching full forms) for acronym, positions in all_usages.items(): - if acronym in self.COMMON_ACRONYMS: + if acronym in common_plus_glossary: continue # Skip if no matching full form found in document diff --git a/src/checkers/ai_artifacts_checker.py b/src/checkers/ai_artifacts_checker.py index 209af561a41e25d1d9f617bfc3e7ee9ad1ba9205..a4c612dd27a869cf23b8950a53515fac8e6bb240 100644 --- a/src/checkers/ai_artifacts_checker.py +++ b/src/checkers/ai_artifacts_checker.py @@ -125,7 +125,7 @@ class AIArtifactsChecker(BaseChecker): severity=CheckSeverity.ERROR, message=f"{description} detected", line_number=line_num, - line_content=line.strip()[:100], + line_content=line.strip(), suggestion="Remove AI-generated conversational text" )) break # One match per line for this category @@ -139,7 +139,7 @@ class AIArtifactsChecker(BaseChecker): severity=CheckSeverity.WARNING, message=f"{description}: '{match.group(0)[:50]}'", line_number=line_num, - line_content=line.strip()[:100], + line_content=line.strip(), suggestion="Replace placeholder with actual content or remove" )) @@ -169,7 +169,7 @@ class AIArtifactsChecker(BaseChecker): severity=CheckSeverity.INFO, message=f"Possible {description} in LaTeX", line_number=line_num, - line_content=line.strip()[:100], + line_content=line.strip(), suggestion="Convert to LaTeX formatting or remove if unintentional" )) diff --git 
a/src/checkers/anonymization_checker.py b/src/checkers/anonymization_checker.py index 8c17e53510ffc4954b128a355b79ec95c37b727c..7596a19d7cc5583f005db6971df1973f042c2521 100644 --- a/src/checkers/anonymization_checker.py +++ b/src/checkers/anonymization_checker.py @@ -79,7 +79,7 @@ class AnonymizationChecker(BaseChecker): severity=CheckSeverity.WARNING, message=f"{desc} in comment (could be revealed when compiling)", line_number=line_num, - line_content=line.strip()[:100], + line_content=line.strip(), suggestion="Remove or anonymize URL even in comments" )) continue @@ -91,7 +91,7 @@ class AnonymizationChecker(BaseChecker): severity=CheckSeverity.ERROR, message=f"{desc} may reveal author identity", line_number=line_num, - line_content=line.strip()[:100], + line_content=line.strip(), suggestion="Replace with anonymized URL or remove for review" )) @@ -112,7 +112,7 @@ class AnonymizationChecker(BaseChecker): severity=CheckSeverity.WARNING, message="Potentially self-revealing citation pattern", line_number=line_num, - line_content=line.strip()[:100], + line_content=line.strip(), suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')" )) diff --git a/src/checkers/base.py b/src/checkers/base.py index de8caf1d73fc485eb2a92532bd41ce8efd9539fc..c934f74fc7a19ca228648ac04eba2f8fa78e4024 100644 --- a/src/checkers/base.py +++ b/src/checkers/base.py @@ -29,7 +29,10 @@ class CheckResult: line_content: Optional[str] = None suggestion: Optional[str] = None file_path: Optional[str] = None - + # Substring of line_content that triggered the issue. The renderer wraps + # this in so the user can see *where* in the line to look. 
+ match_text: Optional[str] = None + def to_dict(self) -> dict: return { 'checker': self.checker_name, @@ -39,7 +42,8 @@ class CheckResult: 'line': self.line_number, 'content': self.line_content, 'suggestion': self.suggestion, - 'file_path': self.file_path + 'file_path': self.file_path, + 'match_text': self.match_text, } @@ -178,7 +182,8 @@ class BaseChecker(ABC): message: str, line_number: Optional[int] = None, line_content: Optional[str] = None, - suggestion: Optional[str] = None + suggestion: Optional[str] = None, + match_text: Optional[str] = None, ) -> CheckResult: """Helper to create a CheckResult with this checker's name.""" return CheckResult( @@ -188,6 +193,7 @@ class BaseChecker(ABC): message=message, line_number=line_number, line_content=line_content, - suggestion=suggestion + suggestion=suggestion, + match_text=match_text, ) diff --git a/src/checkers/citation_quality_checker.py b/src/checkers/citation_quality_checker.py index de29ca2900bfd242adf368f33c1026595c10eb8f..d5db9525dd16932210310ea066623bd680cff5bc 100644 --- a/src/checkers/citation_quality_checker.py +++ b/src/checkers/citation_quality_checker.py @@ -124,7 +124,7 @@ class CitationQualityChecker(BaseChecker): severity=CheckSeverity.WARNING, message="Appears to be hardcoded citation instead of \\cite", line_number=line_num, - line_content=line.strip()[:80], + line_content=line.strip(), suggestion="Use \\cite{} for proper bibliography management" )) diff --git a/src/checkers/consistency_checker.py b/src/checkers/consistency_checker.py index 14849b9b71ed715864b33b0fcda2edfd0f99935d..886a3721613c6db4828315443359c82804f4b324 100644 --- a/src/checkers/consistency_checker.py +++ b/src/checkers/consistency_checker.py @@ -149,25 +149,45 @@ class ConsistencyChecker(BaseChecker): ] def check(self, tex_content: str, config: dict = None) -> List[CheckResult]: + config = config or {} results = [] - + # Remove comments content = re.sub(r'(? 
1: results.append(self._create_result( passed=False, diff --git a/src/checkers/formatting_checker.py b/src/checkers/formatting_checker.py index ffcc3e9df645e870322ab1879e6a70bcb1c4ae1b..c20c2187b520473ac5c1f8f17538857560d7dac0 100644 --- a/src/checkers/formatting_checker.py +++ b/src/checkers/formatting_checker.py @@ -41,9 +41,6 @@ class FormattingChecker(BaseChecker): '^': r'(? List[CheckResult]: results = [] lines = tex_content.split('\n') @@ -66,8 +63,9 @@ class FormattingChecker(BaseChecker): severity=CheckSeverity.INFO, message="Citation without non-breaking space", line_number=line_num, - line_content=line.strip()[:100], - suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')" + line_content=line.strip(), + suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')", + match_text=match.group(0), )) # Track citation styles @@ -90,40 +88,6 @@ class FormattingChecker(BaseChecker): suggestion="Consider using consistent citation style throughout" )) - # Check for multiple blank lines (3 or more) - for match in self.MULTI_BLANK_PATTERN.finditer(tex_content): - line_num = self._find_line_number(tex_content, match.start()) - # Count how many blank lines - blank_count = match.group(0).count('\n') - 1 - - # Get context: the line before, blank lines, and the line after - start_pos = match.start() - end_pos = match.end() - - # Find the line before the blank lines - prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1 - prev_line_end = start_pos - prev_line = tex_content[prev_line_start:prev_line_end].rstrip() - - # Find the line after the blank lines - next_line_end = tex_content.find('\n', end_pos) - if next_line_end == -1: - next_line_end = len(tex_content) - next_line = tex_content[end_pos:next_line_end].rstrip() - - # Create visual representation with warning markers - blank_lines = '\n'.join([f"> blank line ⚠️"] * blank_count) - line_content = f"{prev_line}\n{blank_lines}\n{next_line}" - - results.append(self._create_result( - passed=False, - 
severity=CheckSeverity.INFO, - message=f"Multiple blank lines ({blank_count} consecutive blank lines)", - line_number=line_num, - line_content=line_content, - suggestion="Reduce to single blank line or use \\vspace" - )) - # Check for common issues with special characters results.extend(self._check_special_chars(tex_content, lines)) @@ -159,8 +123,9 @@ class FormattingChecker(BaseChecker): severity=CheckSeverity.WARNING, message="Unescaped & outside tabular/math environment", line_number=line_num, - line_content=line.strip()[:100], - suggestion="Use \\& to escape" + line_content=line.strip(), + suggestion="Use \\& to escape", + match_text=match.group(0), )) return results diff --git a/src/checkers/retraction_checker.py b/src/checkers/retraction_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..54a8a539b747797d1a5d9386f64e6b84cebee1f4 --- /dev/null +++ b/src/checkers/retraction_checker.py @@ -0,0 +1,53 @@ +""" +Bibliography-level checker that flags retracted DOIs. + +Unlike the LaTeX-line checkers in src/checkers/, this one operates on parsed +BibEntry objects, not on a tex_content string. main.py / app.py invoke it +directly via `check_entries(entries)`. 
+""" +from __future__ import annotations + +import concurrent.futures +import logging +from dataclasses import dataclass +from typing import Iterable, List + +from src.fetchers.retraction_fetcher import RetractionFetcher, RetractionResult +from src.parsers.bib_parser import BibEntry + +logger = logging.getLogger(__name__) + + +@dataclass +class RetractionFinding: + entry_key: str + doi: str + result: RetractionResult + + +class RetractionChecker: + """Concurrent batch retraction lookup.""" + + def __init__(self, max_workers: int = 6): + self.fetcher = RetractionFetcher() + self.max_workers = max_workers + + def check_entries(self, entries: Iterable[BibEntry]) -> List[RetractionFinding]: + """Look up retraction status for every entry that has a DOI.""" + with_doi = [e for e in entries if getattr(e, "doi", "")] + if not with_doi: + return [] + + findings: List[RetractionFinding] = [] + + def _one(entry: BibEntry): + res = self.fetcher.check(entry.doi) + return entry, res + + with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as ex: + for entry, res in ex.map(_one, with_doi): + if res is None: + continue + if res.is_retracted or res.update_type: + findings.append(RetractionFinding(entry.key, entry.doi, res)) + return findings diff --git a/src/checkers/sentence_checker.py b/src/checkers/sentence_checker.py index f5dd952cd6e852ad08675237fce20d9c85cf2c32..11df277c32902017bd00ee79f5077ed66356b5dd 100644 --- a/src/checkers/sentence_checker.py +++ b/src/checkers/sentence_checker.py @@ -76,7 +76,7 @@ class SentenceChecker(BaseChecker): severity=CheckSeverity.INFO, message=message, line_number=line_num, - line_content=line.strip()[:80] + line_content=line.strip() )) break # One per line diff --git a/src/checkers/template_checker.py b/src/checkers/template_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..9be041ef7212bf42eab75f3be469ed0577e1e8a5 --- /dev/null +++ b/src/checkers/template_checker.py @@ -0,0 +1,393 @@ +""" 
+Conference-template conformance checker. + +Reads the rich rule set defined in :mod:`src.templates.base_template` and runs +per-venue checks against the LaTeX source. Each rule fragment lives in its own +small private method so adding new conferences (or new rules) doesn't bloat the +public ``check`` method. + +Severity convention used here: + +* ``ERROR`` β€” desk-reject material if uncorrected (NeurIPS missing checklist, + ACL missing Limitations, double-blind \\author leak). +* ``WARNING`` β€” likely a real problem but might be a false positive (style + package mismatch, identifying URL). +* ``INFO`` β€” soft reminder that something MUST happen later (camera-ready + sections, lay summaries, font requirements, page-limit + estimation that the .tex source can't actually verify). +""" +from __future__ import annotations + +import re +from typing import List, Optional + +from .base import BaseChecker, CheckResult, CheckSeverity + + +# ------------------------------------------------------------------ helpers --- + +# Match \section{X}, \subsection{X}, \paragraph{X}, optionally starred, +# allowing an optional [short] argument before the {body}. +def _section_pattern(name: str) -> re.Pattern: + return re.compile( + r'\\(?:section|subsection|paragraph)\*?\s*(?:\[[^\]]*\])?\s*\{[^}]*?' + + re.escape(name) + r'[^}]*\}', + re.IGNORECASE, + ) + + +# Domains/URL patterns that strongly de-anonymize an author. Whitelisted +# domains (which legitimately appear in CV/ML papers without leaking identity) +# are excluded. 
+_IDENTIFYING_URL_PATTERNS = [ + re.compile(r'\bgithub\.com/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE), + re.compile(r'\b[A-Za-z0-9_\-]+\.github\.io\b', re.IGNORECASE), + re.compile(r'\bgitlab\.com/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE), + re.compile(r'\bbitbucket\.org/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE), + re.compile(r'\b(?:huggingface\.co|wandb\.ai)/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE), + re.compile(r'\b(?:linkedin|twitter|x)\.com/[A-Za-z0-9_\-]+', re.IGNORECASE), +] + +# URLs that are explicitly anonymous-friendly and should NOT be flagged. +_ANONYMOUS_URL_HINTS = re.compile( + r'(anonymous|anon|blind|review|submission|4open\.science)', re.IGNORECASE, +) + +# Capture URLs from \url{...}, \href{...}{...}, and bare http(s)://... +_URL_FROM_TEX = re.compile( + r'\\(?:url|href)\s*\{([^}]+)\}|(? List[CheckResult]: + config = config or {} + template = config.get("template") + if template is None: + return [] + + content = self._remove_comments(tex_content) + results: List[CheckResult] = [] + + self._check_mandatory_sections(template, content, results) + self._check_camera_only_sections(template, content, results) + self._check_style_package(template, content, results) + self._check_doc_class(template, content, results) + self._check_paper_size(template, content, results) + + if template.double_blind: + self._check_double_blind_author(template, content, results) + if template.forbid_identifying_urls: + self._check_identifying_urls(template, content, results) + if template.forbid_acks_in_review: + self._check_acknowledgments(template, content, results) + + if template.requires_paper_checklist: + self._check_paper_checklist(template, content, results) + if template.requires_reproducibility_statement: + self._check_reproducibility_statement(template, content, results) + if template.requires_lay_summary_camera: + self._inform_lay_summary(template, results) + if template.requires_type1_fonts: + self._inform_type1_fonts(template, results) + 
if template.min_main_pages > 0: + self._inform_min_pages(template, results) + + if "Limitations" in template.mandatory_sections: + self._check_limitations_content(template, content, results) + + return results + + # ============================================================== sections == + + def _check_mandatory_sections(self, template, content: str, results: List[CheckResult]): + for section in template.mandatory_sections or []: + if not _section_pattern(section).search(content): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.ERROR, + message=f"[{template.name}] Missing mandatory section: '{section}'", + suggestion=f"Add `\\section{{{section}}}` (required by {template.name}).", + )) + + def _check_camera_only_sections(self, template, content: str, results: List[CheckResult]): + for section in template.mandatory_camera_sections or []: + if not _section_pattern(section).search(content): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=( + f"[{template.name}] Camera-ready section '{section}' not found. " + "Required for the camera-ready version, optional for review." + ), + suggestion=f"Add `\\section{{{section}}}` before References for camera-ready.", + )) + + # =================================================== style / typesetting == + + def _check_style_package(self, template, content: str, results: List[CheckResult]): + pkg = (template.style_package or "").strip() + if not pkg: + return + pkg_re = re.compile( + r'\\(?:usepackage|documentclass)(?:\[[^\]]*\])?\s*\{\s*' + + re.escape(pkg) + r'\s*\}' + ) + if not pkg_re.search(content): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message=( + f"[{template.name}] Style package '{pkg}' not found. " + "If you really are submitting to this venue, your template may be wrong." 
+ ), + suggestion=f"Use the official `{pkg}` style package.", + )) + + def _check_doc_class(self, template, content: str, results: List[CheckResult]): + wanted = (template.doc_class or "").strip() + if not wanted: + return + m = _DOCCLASS_RE.search(content) + actual = m.group(2).strip() if m else "" + if actual.lower() != wanted.lower(): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message=( + f"[{template.name}] Expected `\\documentclass{{{wanted}}}`, " + f"found `{actual or 'none'}`." + ), + suggestion=f"Use the official document class `{wanted}` (Springer LNCS for ECCV).", + )) + + def _check_paper_size(self, template, content: str, results: List[CheckResult]): + wanted = (template.paper_size or "").lower() + if wanted not in {"letter", "a4"}: + return + m = _DOCCLASS_RE.search(content) + if not m: + return + opts = (m.group(1) or "").lower() + actual = None + if "letterpaper" in opts or "letter" in opts: + actual = "letter" + elif "a4paper" in opts or "a4" in opts: + actual = "a4" + if actual and actual != wanted: + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message=( + f"[{template.name}] Expected paper size '{wanted}', " + f"document class is set to '{actual}'." 
+ ), + suggestion=f"Use `\\documentclass[{wanted}paper]{{...}}`.", + )) + + # ================================================================ blinding = + + def _check_double_blind_author(self, template, content: str, results: List[CheckResult]): + m = re.search(r'\\author\s*(?:\[[^\]]*\])?\s*\{([^}]*)\}', content) + if not m: + return + body = m.group(1) + if not body.strip(): + return + if re.search(r'(anonymous|hidden|blind|submission)', body, re.IGNORECASE): + return + line_num = self._find_line_number(content, m.start()) + results.append(self._create_result( + passed=False, + severity=CheckSeverity.ERROR, + message=f"[{template.name}] Double-blind: \\author appears to contain identifying info", + line_number=line_num, + line_content=body.strip(), + suggestion=r"Replace \author with anonymous placeholder during review.", + )) + + def _check_identifying_urls(self, template, content: str, results: List[CheckResult]): + for m in _URL_FROM_TEX.finditer(content): + url = (m.group(1) or m.group(2) or "").strip() + if not url: + continue + if _ANONYMOUS_URL_HINTS.search(url): + continue + for pat in _IDENTIFYING_URL_PATTERNS: + if pat.search(url): + line_num = self._find_line_number(content, m.start()) + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message=( + f"[{template.name}] Possible identifying URL during double-blind review: " + f"{url[:120]}" + ), + line_number=line_num, + line_content=url, + suggestion=( + "Use Anonymous GitHub (https://anonymous.4open.science) or remove " + "the link until the camera-ready version." 
+ ), + )) + break # one finding per URL + + def _check_acknowledgments(self, template, content: str, results: List[CheckResult]): + for pat in _ACK_PATTERNS: + m = pat.search(content) + if m: + line_num = self._find_line_number(content, m.start()) + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message=( + f"[{template.name}] Acknowledgments section detected; " + f"{template.short_name.upper()} requires omitting it during review." + ), + line_number=line_num, + suggestion=( + "Comment out or wrap acks in `\\if...\\fi` so they only " + "appear in the camera-ready version." + ), + )) + return # one finding is enough + + # ============================================== per-venue special items === + + def _check_paper_checklist(self, template, content: str, results: List[CheckResult]): + for pat in _NEURIPS_CHECKLIST_PATTERNS: + if pat.search(content): + return + results.append(self._create_result( + passed=False, + severity=CheckSeverity.ERROR, + message=( + f"[{template.name}] NeurIPS Paper Checklist not found. " + "NeurIPS desk-rejects submissions without the checklist." + ), + suggestion=( + "Add `\\input{neurips_paper_checklist}` (or paste the official template) " + "after References / supplementary." + ), + )) + + def _check_reproducibility_statement(self, template, content: str, results: List[CheckResult]): + if _REPRO_SECTION.search(content): + return + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=( + f"[{template.name}] Reproducibility Statement not found. " + "It's encouraged (~1 page) and does not count toward the page limit." + ), + suggestion=( + "Add `\\section*{Reproducibility Statement}` before References summarizing " + "code/data/seeds/hyperparameter availability." 
+ ), + )) + + def _inform_lay_summary(self, template, results: List[CheckResult]): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=( + f"[{template.name}] Lay summary required at camera-ready time " + "(plain-language summary submitted via OpenReview)." + ), + suggestion="Draft a 1–2 paragraph plain-language summary now to avoid a last-minute scramble.", + )) + + def _inform_type1_fonts(self, template, results: List[CheckResult]): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=( + f"[{template.name}] Embedded fonts must be Type-1 only β€” verify with " + "`pdffonts `. Cannot be checked from .tex source alone." + ), + suggestion="Compile with `pdflatex` (not XeLaTeX/LuaLaTeX) and convert any Type-3 fonts.", + )) + + def _inform_min_pages(self, template, results: List[CheckResult]): + results.append(self._create_result( + passed=False, + severity=CheckSeverity.INFO, + message=( + f"[{template.name}] Main text must be at least {template.min_main_pages} pages " + f"and at most {template.page_limit_review} pages. Cannot be measured from source." + ), + suggestion=( + f"Compile and confirm the rendered PDF stays within " + f"{template.min_main_pages}–{template.page_limit_review} pages of main text." + ), + )) + + # ============================================ ACL family: Limitations rule + + def _check_limitations_content(self, template, content: str, results: List[CheckResult]): + # Find the Limitations section span up to the next \section or end of doc. 
+ m = re.search( + r'(\\section\*?\s*(?:\[[^\]]*\])?\s*\{[^}]*Limitations[^}]*\})', + content, re.IGNORECASE, + ) + if not m: + return # mandatory_sections check already flagged absence + start = m.end() + nxt = re.search(r'\\section\*?\s*\{', content[start:], re.IGNORECASE) + end = start + nxt.start() if nxt else len(content) + section_body = content[start:end] + # Discussion-only rule: no floats, no nested \section + if _FLOAT_OR_NEW_SECTION_RE.search(section_body): + line_num = self._find_line_number(content, start) + results.append(self._create_result( + passed=False, + severity=CheckSeverity.WARNING, + message=( + f"[{template.name}] Limitations section appears to contain floats or a " + "nested section. ACL/EMNLP/NAACL require Limitations to be discussion only." + ), + line_number=line_num, + suggestion=( + "Move tables/figures/algorithms out of Limitations into the main body or " + "appendix; Limitations should be prose-only." + ), + )) diff --git a/src/checkers/url_checker.py b/src/checkers/url_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..5997a5747f477d46960383aba3490df6df9abfc4 --- /dev/null +++ b/src/checkers/url_checker.py @@ -0,0 +1,80 @@ +""" +URL liveness checker for bibliography entries. + +Many @misc / blog / repo references rot over time. This checker does a HEAD +(falling back to a small GET) on entry.url and flags anything that returns +4xx/5xx or fails to connect. + +Operates on BibEntry objects, not on tex_content. Invoked from main.py / app.py +when `submission_extra.url_liveness` is true. 
+""" +from __future__ import annotations + +import concurrent.futures +import logging +from dataclasses import dataclass +from typing import Iterable, List, Optional + +import requests + +from src.utils.http import get_session +from src.parsers.bib_parser import BibEntry + +logger = logging.getLogger(__name__) + + +@dataclass +class URLFinding: + entry_key: str + url: str + status: str # "ok" | "broken" | "unreachable" | "skipped" + status_code: Optional[int] = None + detail: str = "" + + +class URLChecker: + """Concurrent HEAD-then-GET liveness check.""" + + SKIP_PREFIXES = ("mailto:", "ftp://", "tel:", "javascript:") + + def __init__(self, max_workers: int = 8, timeout: float = 15.0): + self.max_workers = max_workers + self.timeout = timeout + + def _check_one(self, entry: BibEntry) -> Optional[URLFinding]: + url = (entry.url or "").strip() + if not url: + return None + if any(url.lower().startswith(p) for p in self.SKIP_PREFIXES): + return URLFinding(entry.key, url, "skipped", detail="non-http scheme") + + session = get_session() + try: + r = session.head(url, allow_redirects=True, timeout=self.timeout) + # Many servers return 405/403 for HEAD but are fine with GET; double-check with a tiny GET. 
+ if r.status_code in (403, 405, 501): + r = session.get(url, allow_redirects=True, timeout=self.timeout, stream=True) + # Don't actually read the body + r.close() + except requests.RequestException as e: + logger.debug("URL check failed for %s: %s", url, e, exc_info=True) + return URLFinding(entry.key, url, "unreachable", detail=str(e)[:120]) + + if 200 <= r.status_code < 400: + return URLFinding(entry.key, url, "ok", status_code=r.status_code) + return URLFinding( + entry.key, url, "broken", + status_code=r.status_code, + detail=f"HTTP {r.status_code}", + ) + + def check_entries(self, entries: Iterable[BibEntry]) -> List[URLFinding]: + targets = [e for e in entries if getattr(e, "url", "")] + if not targets: + return [] + findings: List[URLFinding] = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as ex: + for f in ex.map(self._check_one, targets): + if f is not None: + findings.append(f) + return findings diff --git a/src/config/__pycache__/__init__.cpython-313.pyc b/src/config/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index f34a85f65061f1f6901383f2c3427098a0c19e75..0000000000000000000000000000000000000000 Binary files a/src/config/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/src/config/__pycache__/workflow.cpython-313.pyc b/src/config/__pycache__/workflow.cpython-313.pyc deleted file mode 100644 index 0b55e19e08f6821df1f5d3550a2332b5ce0226ca..0000000000000000000000000000000000000000 Binary files a/src/config/__pycache__/workflow.cpython-313.pyc and /dev/null differ diff --git a/src/config/__pycache__/yaml_config.cpython-313.pyc b/src/config/__pycache__/yaml_config.cpython-313.pyc deleted file mode 100644 index 439b18be73390a7bb9137472d739bbc5a36de835..0000000000000000000000000000000000000000 Binary files a/src/config/__pycache__/yaml_config.cpython-313.pyc and /dev/null differ diff --git a/src/config/yaml_config.py b/src/config/yaml_config.py index 
fe34964e7917246d6487797c95f802c42a9bc1f1..5b8a29244432d4766f54a9596138e094e9928e50 100644 --- a/src/config/yaml_config.py +++ b/src/config/yaml_config.py @@ -97,11 +97,36 @@ class LLMConfig: api_key: str = "" -@dataclass +@dataclass class OutputConfig: """Output configuration.""" quiet: bool = False minimal_verified: bool = False + formats: List[str] = field(default_factory=lambda: ["markdown", "html"]) # markdown, html, json + + +@dataclass +class NetworkConfig: + """Network / politeness configuration.""" + contact_email: str = "" + cache_enabled: bool = True + cache_ttl_hours: int = 24 + retry_total: int = 5 + retry_backoff_factor: float = 1.5 + + +@dataclass +class GlossaryConfig: + """User-supplied project glossary for ConsistencyChecker / AcronymChecker.""" + preferred: List[str] = field(default_factory=list) # e.g. ["Transformer", "fine-tuning"] + acronyms: Dict[str, str] = field(default_factory=dict) # e.g. {"NLP": "Natural Language Processing"} + + +@dataclass +class SubmissionExtraConfig: + """Extra submission checks added on top of the original list.""" + url_liveness: bool = False + retraction: bool = True @dataclass @@ -111,9 +136,12 @@ class BibGuardConfig: template: str = "" bibliography: BibliographyConfig = field(default_factory=BibliographyConfig) submission: SubmissionConfig = field(default_factory=SubmissionConfig) + submission_extra: SubmissionExtraConfig = field(default_factory=SubmissionExtraConfig) workflow: List[WorkflowStep] = field(default_factory=list) llm: LLMConfig = field(default_factory=LLMConfig) output: OutputConfig = field(default_factory=OutputConfig) + network: NetworkConfig = field(default_factory=NetworkConfig) + glossary: GlossaryConfig = field(default_factory=GlossaryConfig) # Internal fields to store discovered files in directory mode _bib_files: List[Path] = field(default_factory=list) @@ -225,11 +253,48 @@ def load_config(config_path: str) -> BibGuardConfig: # Parse output section if 'output' in data: out = data['output'] 
+ formats = out.get('formats', ["markdown", "html"]) + if isinstance(formats, str): + formats = [f.strip() for f in formats.split(",") if f.strip()] config.output = OutputConfig( quiet=out.get('quiet', False), - minimal_verified=out.get('minimal_verified', False) + minimal_verified=out.get('minimal_verified', False), + formats=list(formats), ) - + + # Parse network section + if 'network' in data: + net = data['network'] or {} + config.network = NetworkConfig( + contact_email=net.get('contact_email', ''), + cache_enabled=bool(net.get('cache_enabled', True)), + cache_ttl_hours=int(net.get('cache_ttl_hours', 24)), + retry_total=int(net.get('retry_total', 5)), + retry_backoff_factor=float(net.get('retry_backoff_factor', 1.5)), + ) + + # Parse glossary section + if 'glossary' in data: + g = data['glossary'] or {} + preferred = g.get('preferred', []) or [] + acronyms = g.get('acronyms', {}) or {} + if not isinstance(preferred, list): + preferred = [str(preferred)] + if not isinstance(acronyms, dict): + acronyms = {} + config.glossary = GlossaryConfig( + preferred=[str(x) for x in preferred], + acronyms={str(k): str(v) for k, v in acronyms.items()}, + ) + + # Parse submission_extra section (URL liveness, retraction) + if 'submission_extra' in data: + sx = data['submission_extra'] or {} + config.submission_extra = SubmissionExtraConfig( + url_liveness=bool(sx.get('url_liveness', False)), + retraction=bool(sx.get('retraction', True)), + ) + return config @@ -264,6 +329,15 @@ files: template: "" +network: + # Real email used in polite-pool User-Agents (arXiv/CrossRef/OpenAlex). + # Strongly recommended. 
+ contact_email: "" + cache_enabled: true # Local SQLite cache for HTTP responses + cache_ttl_hours: 24 + retry_total: 5 + retry_backoff_factor: 1.5 + bibliography: check_metadata: true check_usage: true @@ -285,16 +359,27 @@ submission: citation_quality: true anonymization: true +submission_extra: + url_liveness: false # HEAD-check every entry.url field (slow, off by default) + retraction: true # Flag retracted DOIs via CrossRef + +# Project-specific glossary helps ConsistencyChecker and AcronymChecker +# avoid false positives and enforce house style. +glossary: + preferred: [] # e.g. ["Transformer", "fine-tuning"] + acronyms: {} # e.g. {NLP: "Natural Language Processing"} + llm: - backend: "gemini" - model: "" - api_key: "" + backend: "gemini" # gemini | openai | anthropic | deepseek | ollama | vllm + model: "" # leave empty for sensible default per backend + api_key: "" # prefer env var _API_KEY output: quiet: false minimal_verified: false + formats: [markdown, html] # any of: markdown, html, json """ with open(output_path, 'w', encoding='utf-8') as f: f.write(default) - + return output_path diff --git a/src/fetchers/__pycache__/__init__.cpython-313.pyc b/src/fetchers/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index d2c8c41f22daf3f9ebe4906fbf377db97bdda5b9..0000000000000000000000000000000000000000 Binary files a/src/fetchers/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc deleted file mode 100644 index d6c0f6d2bea0f0fc0d4eaa6cc21db0bf88d23ea0..0000000000000000000000000000000000000000 Binary files a/src/fetchers/__pycache__/arxiv_fetcher.cpython-313.pyc and /dev/null differ diff --git a/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc deleted file mode 100644 index 
308f1616aefb91ef39445c7ffa769df645c0278d..0000000000000000000000000000000000000000 Binary files a/src/fetchers/__pycache__/crossref_fetcher.cpython-313.pyc and /dev/null differ diff --git a/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc deleted file mode 100644 index 0e6c4c3048b8dbe934a6601fd1d5db5ab66f94e1..0000000000000000000000000000000000000000 Binary files a/src/fetchers/__pycache__/dblp_fetcher.cpython-313.pyc and /dev/null differ diff --git a/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc deleted file mode 100644 index c22735c52179987deacb4a0647d9c24b78709ed7..0000000000000000000000000000000000000000 Binary files a/src/fetchers/__pycache__/openalex_fetcher.cpython-313.pyc and /dev/null differ diff --git a/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc deleted file mode 100644 index 9a5aef1df4d19d55bf6834cf0e7f368896ea8c9b..0000000000000000000000000000000000000000 Binary files a/src/fetchers/__pycache__/scholar_fetcher.cpython-313.pyc and /dev/null differ diff --git a/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc b/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc deleted file mode 100644 index 089ed166dfaaf59e5fc7b54b7f45a9f3e2b2b744..0000000000000000000000000000000000000000 Binary files a/src/fetchers/__pycache__/semantic_scholar_fetcher.cpython-313.pyc and /dev/null differ diff --git a/src/fetchers/arxiv_fetcher.py b/src/fetchers/arxiv_fetcher.py index 22a8a064ebe5cf07f60a641d41c7fa47193684a4..1c8c0d686eb4e6516dd3efb73fac6632f4c9ee9a 100644 --- a/src/fetchers/arxiv_fetcher.py +++ b/src/fetchers/arxiv_fetcher.py @@ -1,6 +1,7 @@ """ arXiv metadata fetcher using the public API. 
""" +import logging import re import time import xml.etree.ElementTree as ET @@ -10,6 +11,11 @@ from urllib.parse import quote import requests +from src.utils.http import get_session, is_open, record_failure, record_success + +logger = logging.getLogger(__name__) +_SOURCE = "arxiv" + @dataclass class ArxivMetadata: @@ -55,60 +61,53 @@ class ArxivFetcher: self._last_request_time = time.time() def fetch_by_id(self, arxiv_id: str) -> Optional[ArxivMetadata]: - """Fetch metadata by arXiv ID.""" - # Clean up ID + """Fetch metadata by arXiv ID. Honors circuit breaker.""" + if is_open(_SOURCE): + return None arxiv_id = arxiv_id.strip() arxiv_id = re.sub(r'^arXiv:', '', arxiv_id, flags=re.IGNORECASE) - + self._rate_limit() - - params = { - 'id_list': arxiv_id, - 'max_results': 1 - } - + + params = {'id_list': arxiv_id, 'max_results': 1} + try: - response = requests.get( - self.API_BASE, - params=params, - timeout=30, - headers={'User-Agent': 'BibChecker/1.0 (mailto:user@example.com)'} - ) + response = get_session().get(self.API_BASE, params=params, timeout=12) response.raise_for_status() + record_success(_SOURCE) except requests.RequestException as e: + logger.debug("arXiv fetch_by_id(%s) failed: %s", arxiv_id, e, exc_info=True) + record_failure(_SOURCE) return None - + return self._parse_response(response.text) def search_by_title(self, title: str, max_results: int = 5) -> list[ArxivMetadata]: - """Search arXiv by title.""" + """Search arXiv by title. 
Honors circuit breaker.""" + if is_open(_SOURCE): + return [] self._rate_limit() - - # Clean up title for search + clean_title = re.sub(r'[^\w\s]', ' ', title) clean_title = re.sub(r'\s+', ' ', clean_title).strip() - - # Build search query search_query = f'ti:"{clean_title}"' - + params = { 'search_query': search_query, 'max_results': max_results, 'sortBy': 'relevance', 'sortOrder': 'descending' } - + try: - response = requests.get( - self.API_BASE, - params=params, - timeout=30, - headers={'User-Agent': 'BibChecker/1.0 (mailto:user@example.com)'} - ) + response = get_session().get(self.API_BASE, params=params, timeout=12) response.raise_for_status() + record_success(_SOURCE) except requests.RequestException as e: + logger.debug("arXiv search_by_title(%s) failed: %s", title[:60], e, exc_info=True) + record_failure(_SOURCE) return [] - + return self._parse_response_multiple(response.text) def _parse_response(self, xml_content: str) -> Optional[ArxivMetadata]: diff --git a/src/fetchers/crossref_fetcher.py b/src/fetchers/crossref_fetcher.py index 1a65cc55b07c2209eb26c8c66958dfc987355643..6906b51aa8f36d8e9c7f4d8f5478afbe650b6816 100644 --- a/src/fetchers/crossref_fetcher.py +++ b/src/fetchers/crossref_fetcher.py @@ -4,11 +4,17 @@ CrossRef API fetcher for bibliography metadata. CrossRef provides free, reliable access to metadata for academic publications. No API key required, no rate limiting for reasonable use. """ +import logging import requests from dataclasses import dataclass from typing import Optional, List import time +from src.utils.http import get_session, is_open, record_failure, record_success + +logger = logging.getLogger(__name__) +_SOURCE = "crossref" + @dataclass class CrossRefResult: @@ -34,107 +40,99 @@ class CrossRefFetcher: BASE_URL = "https://api.crossref.org/works" RATE_LIMIT_DELAY = 1.0 # Be polite - def __init__(self, mailto: str = "bibguard@example.com"): + def __init__(self, mailto: Optional[str] = None): """ Initialize CrossRef fetcher. 
- - Args: - mailto: Email for polite pool (gets better rate limits) + + The shared HTTP session already carries a polite-pool User-Agent built + from `network.contact_email`. `mailto` here is kept for backward + compatibility but no longer overrides the session header. """ self.mailto = mailto self._last_request_time = 0.0 - self._session = requests.Session() - + def _rate_limit(self): """Ensure rate limiting between requests.""" elapsed = time.time() - self._last_request_time if elapsed < self.RATE_LIMIT_DELAY: time.sleep(self.RATE_LIMIT_DELAY - elapsed) self._last_request_time = time.time() - + def _get_headers(self) -> dict: - """Get request headers with mailto for polite pool.""" - return { - 'User-Agent': f'BibGuard/1.0 (mailto:{self.mailto})', - 'Accept': 'application/json', - } + return {'Accept': 'application/json'} def search_by_title(self, title: str, max_results: int = 5) -> Optional[CrossRefResult]: - """ - Search for a paper by title. - - Args: - title: Paper title to search for - max_results: Maximum number of results to retrieve - - Returns: - Best matching CrossRefResult or None if not found - """ + """Top-1 result. 
See `search_by_title_multi` for the candidate list.""" + results = self.search_by_title_multi(title, max_results=max_results) + return results[0] if results else None + + def search_by_title_multi(self, title: str, max_results: int = 5) -> List[CrossRefResult]: + """Return up to `max_results` candidates so callers can pick the best match.""" + if is_open(_SOURCE): + return [] self._rate_limit() - + params = { 'query.title': title, 'rows': max_results, 'select': 'title,author,published-print,published-online,DOI,publisher,container-title,abstract' } - + try: - response = self._session.get( + response = get_session().get( self.BASE_URL, params=params, headers=self._get_headers(), - timeout=30 + timeout=12, ) response.raise_for_status() - + data = response.json() - if data.get('status') != 'ok': - return None - - items = data.get('message', {}).get('items', []) - - if not items: - return None - - # Return best match (first result, as CrossRef ranks by relevance) - return self._parse_item(items[0]) - - except requests.RequestException: - return None + return [] + + items = data.get('message', {}).get('items', []) or [] + out: List[CrossRefResult] = [] + for it in items: + parsed = self._parse_item(it) + if parsed: + out.append(parsed) + record_success(_SOURCE) + return out + + except requests.RequestException as e: + logger.debug("CrossRef search_by_title(%s) failed: %s", title[:60], e, exc_info=True) + record_failure(_SOURCE) + return [] def search_by_doi(self, doi: str) -> Optional[CrossRefResult]: - """ - Fetch metadata by DOI. - - Args: - doi: DOI of the paper - - Returns: - CrossRefResult or None if not found - """ + """Fetch metadata by DOI. 
Honors circuit breaker.""" + if is_open(_SOURCE): + return None self._rate_limit() - - # Clean DOI (remove https://doi.org/ prefix if present) + doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '') - + try: - response = self._session.get( + response = get_session().get( f"{self.BASE_URL}/{doi}", headers=self._get_headers(), - timeout=30 + timeout=12, ) response.raise_for_status() - + data = response.json() - + if data.get('status') != 'ok': return None - + item = data.get('message', {}) + record_success(_SOURCE) return self._parse_item(item) - - except requests.RequestException: + + except requests.RequestException as e: + logger.debug("CrossRef search_by_doi(%s) failed: %s", doi, e, exc_info=True) + record_failure(_SOURCE) return None def _parse_item(self, item: dict) -> Optional[CrossRefResult]: diff --git a/src/fetchers/dblp_fetcher.py b/src/fetchers/dblp_fetcher.py index bb493d95f5fe049d2a6fc5a18e437534239be783..04ebcc00ffa591902583d0c5afd2ed32b8c4ac0a 100644 --- a/src/fetchers/dblp_fetcher.py +++ b/src/fetchers/dblp_fetcher.py @@ -4,6 +4,10 @@ import logging from typing import Optional, List, Dict, Any from dataclasses import dataclass +from src.utils.http import get_session, is_open, record_failure, record_success + +_SOURCE = "dblp" + @dataclass class DBLPResult: title: str @@ -32,41 +36,67 @@ class DBLPFetcher: self.last_request_time = time.time() def search_by_title(self, title: str) -> Optional[DBLPResult]: - """ - Search DBLP by title. - - Args: - title: Paper title to search for - - Returns: - DBLPResult if found, None otherwise - """ + """Top-1 result. See `search_by_title_multi` for the candidate list.""" + results = self.search_by_title_multi(title, max_results=5) + return results[0] if results else None + + def search_by_title_multi(self, title: str, max_results: int = 5) -> List[DBLPResult]: + """Return up to `max_results` DBLP hits. 
Honors circuit breaker.""" + if is_open(_SOURCE): + return [] self._wait_for_rate_limit() - - params = { - "q": title, - "format": "json", - "h": 3 # Limit to top 3 hits - } - + + params = {"q": title, "format": "json", "h": max_results} + try: - response = requests.get(self.BASE_URL, params=params, timeout=10) - + response = get_session().get(self.BASE_URL, params=params, timeout=8) + if response.status_code == 429: - self.logger.warning("DBLP rate limit exceeded. Waiting longer...") - time.sleep(5) - return None - + self.logger.warning("DBLP rate limit exceeded; tripping breaker") + record_failure(_SOURCE, threshold=2) # DBLP 429 is sticky + return [] + if response.status_code != 200: - self.logger.warning(f"DBLP API error: {response.status_code}") - return None - + self.logger.debug("DBLP API status %s for title=%r", response.status_code, title[:60]) + record_failure(_SOURCE) + return [] + data = response.json() - return self._parse_response(data, title) - + record_success(_SOURCE) + return self._parse_response_multi(data) + except Exception as e: - self.logger.error(f"Error fetching from DBLP: {e}") - return None + self.logger.debug("Error fetching from DBLP for title=%r: %s", title[:60], e, exc_info=True) + record_failure(_SOURCE) + return [] + + def _parse_response_multi(self, data: Dict[str, Any]) -> List[DBLPResult]: + out: List[DBLPResult] = [] + try: + hits = data.get("result", {}).get("hits", {}).get("hit", []) or [] + for hit in hits: + info = hit.get("info", {}) or {} + authors_data = info.get("authors", {}).get("author", []) + authors: List[str] = [] + if isinstance(authors_data, list): + authors = [a.get("text", "") for a in authors_data if isinstance(a, dict)] + elif isinstance(authors_data, dict): + authors = [authors_data.get("text", "")] + title = info.get("title", "") or "" + if title.endswith("."): + title = title[:-1] + doi = info.get("doi", "") + out.append(DBLPResult( + title=title, + authors=[a for a in authors if a], + 
year=info.get("year", ""), + venue=info.get("venue", ""), + url=info.get("url", ""), + doi=doi if doi else None, + )) + except Exception as e: + self.logger.error("DBLP parse failed: %s", e, exc_info=True) + return out def _parse_response(self, data: Dict[str, Any], query_title: str) -> Optional[DBLPResult]: """Parse DBLP JSON response.""" @@ -117,5 +147,5 @@ class DBLPFetcher: ) except Exception as e: - self.logger.error(f"Error parsing DBLP response: {e}") + self.logger.error("Error parsing DBLP response: %s", e, exc_info=True) return None diff --git a/src/fetchers/openalex_fetcher.py b/src/fetchers/openalex_fetcher.py index f978cc8d9e5a59e0918b3240804fbb04e6c6c105..2919ef7f069d96543b406abd61c79c46a9cf617a 100644 --- a/src/fetchers/openalex_fetcher.py +++ b/src/fetchers/openalex_fetcher.py @@ -2,12 +2,18 @@ OpenAlex API fetcher. Free and open API for scholarly metadata. """ +import logging import time from dataclasses import dataclass from typing import Optional import requests +from src.utils.http import get_session, is_open, record_failure, record_success + +logger = logging.getLogger(__name__) +_SOURCE = "openalex" + @dataclass class OpenAlexResult: @@ -36,25 +42,10 @@ class OpenAlexFetcher: RATE_LIMIT_DELAY = 0.1 # 10 req/sec max def __init__(self, email: Optional[str] = None): - """ - Initialize OpenAlex fetcher. - - Args: - email: Optional email for polite pool (faster rate limits) - """ + """OpenAlex fetcher. 
Shared session UA already includes contact email.""" self.email = email self._last_request_time = 0.0 - self._session = requests.Session() - - # Set user agent (required by OpenAlex) - self._session.headers.update({ - 'User-Agent': 'BibGuard/1.0 (https://github.com/thinkwee/BibGuard; mailto:bibguard@example.com)' - }) - - # Add email to polite pool if provided - if email: - self._session.headers.update({'From': email}) - + def _rate_limit(self): """Ensure rate limiting between requests.""" elapsed = time.time() - self._last_request_time @@ -63,62 +54,54 @@ class OpenAlexFetcher: self._last_request_time = time.time() def search_by_title(self, title: str, max_results: int = 5) -> Optional[OpenAlexResult]: - """ - Search for a paper by title. - - Args: - title: Paper title to search for - max_results: Maximum number of results to fetch (default: 5) - - Returns: - OpenAlexResult if found, None otherwise - """ + """Top-1 result. See `search_by_title_multi` for the candidate list.""" + results = self.search_by_title_multi(title, max_results=max_results) + return results[0] if results else None + + def search_by_title_multi(self, title: str, max_results: int = 5) -> list[OpenAlexResult]: + """Return up to `max_results` candidates. 
Honors circuit breaker.""" + if is_open(_SOURCE): + return [] self._rate_limit() - + url = f"{self.BASE_URL}/works" - params = { - 'search': title, - 'per-page': max_results - } - + params = {'search': title, 'per-page': max_results} + try: - response = self._session.get(url, params=params, timeout=10) + response = get_session().get(url, params=params, timeout=8) response.raise_for_status() data = response.json() - - results = data.get('results', []) - if not results: - return None - - # Return the first (most relevant) result - return self._parse_work(results[0]) - - except requests.RequestException: - return None + out: list[OpenAlexResult] = [] + for w in data.get('results', []) or []: + parsed = self._parse_work(w) + if parsed: + out.append(parsed) + record_success(_SOURCE) + return out + except requests.RequestException as e: + logger.debug("OpenAlex search_by_title(%s) failed: %s", title[:60], e, exc_info=True) + record_failure(_SOURCE) + return [] def fetch_by_doi(self, doi: str) -> Optional[OpenAlexResult]: - """ - Fetch paper metadata by DOI. - - Args: - doi: DOI of the paper - - Returns: - OpenAlexResult if found, None otherwise - """ + """Fetch paper metadata by DOI. 
Honors circuit breaker.""" + if is_open(_SOURCE): + return None self._rate_limit() - - # OpenAlex uses DOI URLs + doi_url = f"https://doi.org/{doi}" url = f"{self.BASE_URL}/works/{doi_url}" - + try: - response = self._session.get(url, timeout=10) + response = get_session().get(url, timeout=8) response.raise_for_status() data = response.json() + record_success(_SOURCE) return self._parse_work(data) - - except requests.RequestException: + + except requests.RequestException as e: + logger.debug("OpenAlex fetch_by_doi(%s) failed: %s", doi, e, exc_info=True) + record_failure(_SOURCE) return None def _parse_work(self, work_data: dict) -> Optional[OpenAlexResult]: diff --git a/src/fetchers/retraction_fetcher.py b/src/fetchers/retraction_fetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..a2cbc75b93e6dc433fecb047be62404b77c11565 --- /dev/null +++ b/src/fetchers/retraction_fetcher.py @@ -0,0 +1,107 @@ +""" +Retraction Watch / CrossRef retraction lookup. + +CrossRef exposes an `update-to` relation on retracted works. We query the +CrossRef Works API for a DOI and check the `update-to` and `update-policy` +fields. Retraction Watch's own API requires registration; CrossRef coverage +is broad enough to catch the majority of retracted ML/NLP papers. +""" +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Optional + +import requests + +from src.utils.http import get_session + +logger = logging.getLogger(__name__) + + +@dataclass +class RetractionResult: + is_retracted: bool + update_type: str = "" # "retraction", "correction", "expression-of-concern", ... 
+ notice_doi: str = "" + notice_label: str = "" + notice_url: str = "" + + +class RetractionFetcher: + """Look up retraction status of a DOI via CrossRef.""" + + BASE_URL = "https://api.crossref.org/works" + + # We treat any of these as a hard red flag + HARD_FLAGS = {"retraction", "withdrawal", "removal"} + SOFT_FLAGS = {"expression-of-concern", "correction", "erratum"} + + def __init__(self, mailto: Optional[str] = None): + self.mailto = mailto + + def check(self, doi: str) -> Optional[RetractionResult]: + """Return RetractionResult or None on lookup failure.""" + if not doi: + return None + # Normalize DOI (strip URL prefix) + doi = doi.replace("https://doi.org/", "").replace("http://doi.org/", "").strip() + if not doi: + return None + + try: + response = get_session().get( + f"{self.BASE_URL}/{doi}", + headers={"Accept": "application/json"}, + timeout=20, + ) + if response.status_code == 404: + return None + response.raise_for_status() + except requests.RequestException as e: + logger.debug("Retraction lookup failed for %s: %s", doi, e, exc_info=True) + return None + + try: + data = response.json() + except ValueError: + return None + + if data.get("status") != "ok": + return None + + msg = data.get("message", {}) or {} + # CrossRef puts retraction notices under "update-to": [{"DOI": ..., "type": "retraction", ...}] + # and the *original* paper that is retracted has the notice as `update-to` or in `relation`. + notices = [] + for upd in msg.get("update-to") or []: + t = (upd.get("type") or "").lower().replace("_", "-") + if t: + notices.append((t, upd.get("DOI", ""), upd.get("label", ""))) + # Some retraction *notices themselves* have type:"retraction" in the work itself. 
+ msg_type = (msg.get("type") or "").lower() + if msg_type in self.HARD_FLAGS: + notices.append((msg_type, doi, msg.get("title", [""])[0] if msg.get("title") else "")) + + if not notices: + return RetractionResult(is_retracted=False) + + # Pick the most severe + for t, ndoi, label in notices: + if t in self.HARD_FLAGS: + return RetractionResult( + is_retracted=True, + update_type=t, + notice_doi=ndoi, + notice_label=label, + notice_url=f"https://doi.org/{ndoi}" if ndoi else "", + ) + # Soft flag: not retraction but worth surfacing + t, ndoi, label = notices[0] + return RetractionResult( + is_retracted=False, + update_type=t, + notice_doi=ndoi, + notice_label=label, + notice_url=f"https://doi.org/{ndoi}" if ndoi else "", + ) diff --git a/src/fetchers/scholar_fetcher.py b/src/fetchers/scholar_fetcher.py index 47a3cd36fb8a9c88f3890e9f09cfd26e93582f2a..4ebfe859d38f0a4048082aa6a9d7c6bed66d5153 100644 --- a/src/fetchers/scholar_fetcher.py +++ b/src/fetchers/scholar_fetcher.py @@ -1,6 +1,7 @@ """ Google Scholar search (scraping-based fallback). """ +import logging import re import time import random @@ -10,6 +11,8 @@ from typing import Optional import requests from bs4 import BeautifulSoup +logger = logging.getLogger(__name__) + @dataclass class ScholarResult: @@ -97,12 +100,13 @@ class ScholarFetcher: ) response.raise_for_status() except requests.RequestException as e: + logger.debug("Google Scholar query failed: %s", e, exc_info=True) return [] - + # Check if we're blocked if 'unusual traffic' in response.text.lower() or response.status_code == 429: self._blocked = True - print(f"⚠️ Google Scholar blocked after {self._request_count} requests. 
Skipping further Scholar queries.") + logger.warning("Google Scholar blocked after %s requests; skipping further queries", self._request_count) return [] return self._parse_results(response.text, max_results) diff --git a/src/fetchers/semantic_scholar_fetcher.py b/src/fetchers/semantic_scholar_fetcher.py index 8170a2e9a658d00bf9e5d82a165a5183365b316b..a510aec64baba61282645fa702be2db3a74bc566 100644 --- a/src/fetchers/semantic_scholar_fetcher.py +++ b/src/fetchers/semantic_scholar_fetcher.py @@ -2,12 +2,18 @@ Semantic Scholar API fetcher. Official API with high quality metadata and generous rate limits. """ +import logging import time from dataclasses import dataclass from typing import Optional import requests +from src.utils.http import get_session, is_open, record_failure, record_success + +logger = logging.getLogger(__name__) +_SOURCE = "s2" + @dataclass class SemanticScholarResult: @@ -36,17 +42,16 @@ class SemanticScholarFetcher: def __init__(self, api_key: Optional[str] = None): """ - Initialize Semantic Scholar fetcher. - - Args: - api_key: Optional API key for higher rate limits (free from semanticscholar.org) + Semantic Scholar fetcher. Uses shared session; api_key is added per-call + as a header so the cache key includes it. """ self.api_key = api_key self._last_request_time = 0.0 - self._session = requests.Session() - - if api_key: - self._session.headers.update({'x-api-key': api_key}) + + def _headers(self) -> dict: + if self.api_key: + return {'x-api-key': self.api_key} + return {} def _rate_limit(self): """Ensure rate limiting between requests.""" @@ -56,95 +61,91 @@ class SemanticScholarFetcher: self._last_request_time = time.time() def search_by_title(self, title: str, max_results: int = 5) -> Optional[SemanticScholarResult]: + """Return the top-1 search result. 
See `search_by_title_multi` for the full list.""" + results = self.search_by_title_multi(title, max_results=max_results) + return results[0] if results else None + + def search_by_title_multi(self, title: str, max_results: int = 5) -> list[SemanticScholarResult]: """ - Search for a paper by title. - - Args: - title: Paper title to search for - max_results: Maximum number of results to fetch (default: 5) - - Returns: - SemanticScholarResult if found, None otherwise + Return up to `max_results` candidate results so callers can pick the best match. """ + if is_open(_SOURCE): + return [] self._rate_limit() - + url = f"{self.BASE_URL}/paper/search" params = { 'query': title, 'limit': max_results, 'fields': 'title,authors,year,abstract,paperId,citationCount,url' } - + try: - response = self._session.get(url, params=params, timeout=10) + response = get_session().get(url, params=params, headers=self._headers(), timeout=8) response.raise_for_status() data = response.json() - + papers = data.get('data', []) - if not papers: - return None - - # Return the first (most relevant) result - return self._parse_paper(papers[0]) - - except requests.RequestException: - return None + results: list[SemanticScholarResult] = [] + for p in papers: + parsed = self._parse_paper(p) + if parsed: + results.append(parsed) + record_success(_SOURCE) + return results + + except requests.RequestException as e: + logger.debug("S2 search_by_title(%s) failed: %s", title[:60], e, exc_info=True) + record_failure(_SOURCE) + return [] def fetch_by_doi(self, doi: str) -> Optional[SemanticScholarResult]: - """ - Fetch paper metadata by DOI. - - Args: - doi: DOI of the paper - - Returns: - SemanticScholarResult if found, None otherwise - """ + """Fetch paper metadata by DOI. 
Honors circuit breaker.""" + if is_open(_SOURCE): + return None self._rate_limit() - + url = f"{self.BASE_URL}/paper/DOI:{doi}" params = { 'fields': 'title,authors,year,abstract,paperId,citationCount,url' } - + try: - response = self._session.get(url, params=params, timeout=10) + response = get_session().get(url, params=params, headers=self._headers(), timeout=8) response.raise_for_status() data = response.json() + record_success(_SOURCE) return self._parse_paper(data) - - except requests.RequestException: + + except requests.RequestException as e: + logger.debug("S2 fetch_by_doi(%s) failed: %s", doi, e, exc_info=True) + record_failure(_SOURCE) return None - + def fetch_by_arxiv_id(self, arxiv_id: str) -> Optional[SemanticScholarResult]: - """ - Fetch paper metadata by arXiv ID. - - Args: - arxiv_id: arXiv ID (e.g., "2301.12345" or "arXiv:2301.12345") - - Returns: - SemanticScholarResult if found, None otherwise - """ + """Fetch paper metadata by arXiv ID. Honors circuit breaker.""" + if is_open(_SOURCE): + return None self._rate_limit() - - # Clean arXiv ID (remove "arXiv:" prefix if present) + clean_id = arxiv_id.replace('arXiv:', '') - url = f"{self.BASE_URL}/paper/ARXIV:{clean_id}" params = { 'fields': 'title,authors,year,abstract,paperId,citationCount,url' } - + try: - response = self._session.get(url, params=params, timeout=10) + response = get_session().get(url, params=params, headers=self._headers(), timeout=8) response.raise_for_status() data = response.json() + record_success(_SOURCE) return self._parse_paper(data) - - except requests.RequestException: + + except requests.RequestException as e: + logger.debug("S2 fetch_by_arxiv_id(%s) failed: %s", arxiv_id, e, exc_info=True) + record_failure(_SOURCE) return None - + def _parse_paper(self, paper_data: dict) -> Optional[SemanticScholarResult]: """Parse paper data from API response.""" try: diff --git a/src/parsers/__pycache__/__init__.cpython-311.pyc b/src/parsers/__pycache__/__init__.cpython-311.pyc 
deleted file mode 100644 index f017711d2f52991009793611123488f899922d17..0000000000000000000000000000000000000000 Binary files a/src/parsers/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/src/parsers/__pycache__/__init__.cpython-313.pyc b/src/parsers/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index a15dd1f751a12d6f5b7ff993742bb1678d571ec6..0000000000000000000000000000000000000000 Binary files a/src/parsers/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/src/parsers/__pycache__/bib_parser.cpython-311.pyc b/src/parsers/__pycache__/bib_parser.cpython-311.pyc deleted file mode 100644 index d4282597b28154a0a8adb3ce685336f6c9d67131..0000000000000000000000000000000000000000 Binary files a/src/parsers/__pycache__/bib_parser.cpython-311.pyc and /dev/null differ diff --git a/src/parsers/__pycache__/bib_parser.cpython-313.pyc b/src/parsers/__pycache__/bib_parser.cpython-313.pyc deleted file mode 100644 index 5b404d8ab5625452ced6d60b2ddd63a2c1603767..0000000000000000000000000000000000000000 Binary files a/src/parsers/__pycache__/bib_parser.cpython-313.pyc and /dev/null differ diff --git a/src/parsers/__pycache__/tex_parser.cpython-313.pyc b/src/parsers/__pycache__/tex_parser.cpython-313.pyc deleted file mode 100644 index e2fd788571f2145b5b053a9de4091a48329fa1e9..0000000000000000000000000000000000000000 Binary files a/src/parsers/__pycache__/tex_parser.cpython-313.pyc and /dev/null differ diff --git a/src/report/__pycache__/__init__.cpython-313.pyc b/src/report/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index bec69e5de0fd5315c6a0c642dc565d685207de90..0000000000000000000000000000000000000000 Binary files a/src/report/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/src/report/__pycache__/generator.cpython-313.pyc b/src/report/__pycache__/generator.cpython-313.pyc deleted file mode 100644 index 
7e7df3494844c80e1581b16a5359eba1bd6653a2..0000000000000000000000000000000000000000 Binary files a/src/report/__pycache__/generator.cpython-313.pyc and /dev/null differ diff --git a/src/report/__pycache__/line_report.cpython-313.pyc b/src/report/__pycache__/line_report.cpython-313.pyc deleted file mode 100644 index 38464dba5318b23773c196083cfaece9c6dcd917..0000000000000000000000000000000000000000 Binary files a/src/report/__pycache__/line_report.cpython-313.pyc and /dev/null differ diff --git a/src/report/generator.py b/src/report/generator.py index 73fe74c12642a85d0b6e1a1b04122daebeb1815c..837c4b06bfd0d73ffaaf132ad80b123fe3a209ab 100644 --- a/src/report/generator.py +++ b/src/report/generator.py @@ -1,10 +1,11 @@ """ Report generator for bibliography check results. """ +import json import re -from dataclasses import dataclass +from dataclasses import asdict, dataclass, is_dataclass from datetime import datetime -from typing import Optional, List +from typing import Any, Optional, List, Dict from pathlib import Path from ..parsers.bib_parser import BibEntry @@ -13,6 +14,7 @@ from ..analyzers.usage_checker import UsageResult from ..analyzers.llm_evaluator import EvaluationResult from ..analyzers.duplicate_detector import DuplicateGroup from ..checkers.base import CheckResult, CheckSeverity +from .html_report import render_standalone_html @dataclass @@ -24,6 +26,14 @@ class EntryReport: evaluations: list[EvaluationResult] +def _json_default(o): + if is_dataclass(o): + return asdict(o) + if hasattr(o, "value"): + return o.value + return str(o) + + class ReportGenerator: """Generates formatted markdown reports.""" @@ -40,6 +50,14 @@ class ReportGenerator: self.template = None # Conference template if used self.check_preprint_ratio = check_preprint_ratio # Whether to check preprint ratio self.preprint_warning_threshold = preprint_warning_threshold # Threshold for preprint warning + self.retraction_findings: list = [] # F1 results + self.url_findings: list = [] # F2 
results + + def set_retraction_findings(self, findings) -> None: + self.retraction_findings = list(findings or []) + + def set_url_findings(self, findings) -> None: + self.url_findings = list(findings or []) def add_entry_report(self, report: EntryReport): @@ -177,7 +195,6 @@ class ReportGenerator: "Unreferenced table": "Unreferenced Table", "Unreferenced section": "Unreferenced Section", "Unreferenced label": "Unreferenced Label", - "Multiple blank lines": "Multiple Blank Lines", "Citation from": "Old Citation (10+ years)", "Hedging language": "Hedging/Vague Language", "Redundant phrase": "Redundant Phrasing", @@ -233,20 +250,22 @@ class ReportGenerator: return "\n".join(lines) def _generate_header(self) -> list[str]: - """Generate report header.""" - bib_names = ", ".join([f"`{Path(f).name}`" for f in self.bib_files]) if self.bib_files else "N/A" - tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A" + """Generate report header. + + File names are intentionally not printed β€” keep the report + portable, and never expose local source paths to anyone the + report is shared with. + """ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - return [ "# Bibliography Validation Report", "", f"**Generated:** {timestamp}", "", - "| File Type | Filename |", - "|-----------|----------|", - f"| **Bib File(s)** | {bib_names} |", - f"| **TeX File(s)** | {tex_names} |" + "| Inputs | Count |", + "|--------|-------|", + f"| **Bib File(s)** | {len(self.bib_files)} |", + f"| **TeX File(s)** | {len(self.tex_files)} |", ] def _generate_disclaimer(self) -> list[str]: @@ -526,6 +545,13 @@ class ReportGenerator: lines.append(f" - **Fetched:** `{', '.join(comp.fetched_authors)}`") else: lines.append(f" - πŸ”Έ {issue}") + + # Positive notes (corroboration, year-tolerance) β€” separate from issues. 
+ notes = list(getattr(comp, "notes", []) or []) + if notes and not minimal: + lines.append(" - **Notes:**") + for note in notes: + lines.append(f" - 🟒 {note}") # Relevance Status if report.evaluations and not minimal: @@ -533,13 +559,8 @@ class ReportGenerator: for eval_res in report.evaluations: score_icon = "🟒" if eval_res.relevance_score >= 4 else ("🟑" if eval_res.relevance_score == 3 else "πŸ”΄") lines.append(f" - {score_icon} **Score {eval_res.relevance_score}/5** ({eval_res.score_label})") - loc = [] - if eval_res.file_path: - loc.append(f"File: `{Path(eval_res.file_path).name}`") if eval_res.line_number: - loc.append(f"Line {eval_res.line_number}") - if loc: - lines.append(f" - {' | '.join(loc)}") + lines.append(f" - Line {eval_res.line_number}") lines.append(f" - *\"{eval_res.explanation}\"*") lines.append("") @@ -589,42 +610,41 @@ class ReportGenerator: by_checker[result.checker_name] = [] by_checker[result.checker_name].append(result) + def _format_one(result) -> list[str]: + """Render a single CheckResult β€” line number only, no file path, + no truncation. The HTML report follows the same convention.""" + buf = [f"- {result.message}"] + if result.line_number: + buf.append(f" - Line {result.line_number}") + if result.line_content: + # Highlight the offending span if the checker provided one. 
+ content = result.line_content + if getattr(result, "match_text", None) and result.match_text in content: + idx = content.index(result.match_text) + content = (content[:idx] + + "**" + result.match_text + "**" + + content[idx + len(result.match_text):]) + buf.append(f" - `{content}`") + if result.suggestion: + buf.append(f" - πŸ’‘ *{result.suggestion}*") + return buf + # Display errors first if errors: lines.append("### πŸ”΄ Critical Errors") lines.append("") for result in errors: - lines.append(f"- **{result.message}**") - loc = [] - if result.file_path: - loc.append(f"File: `{Path(result.file_path).name}`") - if result.line_number: - loc.append(f"Line {result.line_number}") - if loc: - lines.append(f" - {' | '.join(loc)}") - if result.line_content: - lines.append(f" - `{result.line_content[:80]}`") - if result.suggestion: - lines.append(f" - πŸ’‘ *{result.suggestion}*") + lines.extend(_format_one(result)) lines.append("") - + # Display warnings if warnings: lines.append("### 🟑 Warnings") lines.append("") for result in warnings: - lines.append(f"- {result.message}") - loc = [] - if result.file_path: - loc.append(f"File: `{Path(result.file_path).name}`") - if result.line_number: - loc.append(f"Line {result.line_number}") - if loc: - lines.append(f" - {' | '.join(loc)}") - if result.suggestion: - lines.append(f" - πŸ’‘ *{result.suggestion}*") + lines.extend(_format_one(result)) lines.append("") - + # Display suggestions (collapsible) if infos: lines.append("### πŸ”΅ Suggestions") @@ -632,16 +652,7 @@ class ReportGenerator: lines.append("Click to view suggestions") lines.append("") for result in infos: - lines.append(f"- {result.message}") - loc = [] - if result.file_path: - loc.append(f"File: `{Path(result.file_path).name}`") - if result.line_number: - loc.append(f"Line {result.line_number}") - if loc: - lines.append(f" - {' | '.join(loc)}") - if result.suggestion: - lines.append(f" - πŸ’‘ *{result.suggestion}*") + lines.extend(_format_one(result)) lines.append("") 
lines.append("") lines.append("") @@ -671,12 +682,10 @@ class ReportGenerator: lines.append("") lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") lines.append("") - bib_names = ", ".join([f"`{Path(f).name}`" for f in self.bib_files]) if self.bib_files else "N/A" - tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A" - lines.append("| File Type | Filename |") - lines.append("|-----------|----------|") - lines.append(f"| **Bib File(s)** | {bib_names} |") - lines.append(f"| **TeX File(s)** | {tex_names} |") + lines.append("| Inputs | Count |") + lines.append("|--------|-------|") + lines.append(f"| **Bib File(s)** | {len(self.bib_files)} |") + lines.append(f"| **TeX File(s)** | {len(self.tex_files)} |") lines.append("") # Disclaimer @@ -733,6 +742,131 @@ class ReportGenerator: with open(filepath, 'w', encoding='utf-8') as f: f.write(content) + # ------------------------------------------------------------------ + # JSON + standalone HTML output + # ------------------------------------------------------------------ + def build_payload(self) -> Dict[str, Any]: + """Build the JSON-serializable payload used by JSON & HTML outputs.""" + def _entry_dict(e: BibEntry) -> dict: + return { + "key": e.key, "entry_type": e.entry_type, "title": e.title, + "author": e.author, "year": e.year, "journal": e.journal, + "booktitle": e.booktitle, "publisher": e.publisher, + "doi": e.doi, "arxiv_id": e.arxiv_id, "url": e.url, + "volume": e.volume, "pages": e.pages, + } + + def _comparison_dict(c: Optional[ComparisonResult]) -> Optional[dict]: + if c is None: return None + return { + "is_match": c.is_match, "confidence": c.confidence, + "title_match": c.title_match, "title_similarity": c.title_similarity, + "author_match": c.author_match, "author_similarity": c.author_similarity, + "year_match": c.year_match, + "bib_title": c.bib_title, "fetched_title": c.fetched_title, + "bib_authors": c.bib_authors, 
"fetched_authors": c.fetched_authors, + "bib_year": c.bib_year, "fetched_year": c.fetched_year, + "issues": list(c.issues), "source": c.source, + "notes": list(getattr(c, "notes", []) or []), + "published_version_hint": getattr(c, "published_version_hint", ""), + } + + def _usage_dict(u: Optional[UsageResult]) -> Optional[dict]: + if u is None: return None + return {"is_used": u.is_used, "usage_count": getattr(u, "usage_count", 0)} + + def _eval_dict(ev: EvaluationResult) -> dict: + return { + "entry_key": ev.entry_key, + "relevance_score": ev.relevance_score, + "is_relevant": ev.is_relevant, + "explanation": ev.explanation, + "citation_role": getattr(ev, "citation_role", ""), + "line_number": ev.line_number, "file_path": ev.file_path, + "error": ev.error, + } + + entries_payload = [] + for r in self.entries: + entries_payload.append({ + "entry": _entry_dict(r.entry), + "comparison": _comparison_dict(r.comparison), + "usage": _usage_dict(r.usage), + "evaluations": [_eval_dict(ev) for ev in (r.evaluations or [])], + }) + + sub_payload = [] + for r in self.submission_results: + sub_payload.append({ + "checker": r.checker_name, "passed": r.passed, + "severity": r.severity.value if hasattr(r.severity, "value") else str(r.severity), + "message": r.message, "line_number": r.line_number, + "line_content": r.line_content, "suggestion": r.suggestion, + # file_path intentionally omitted β€” user-facing report should + # never expose local tex paths. 
+ "match_text": getattr(r, "match_text", None), + }) + + retr_payload = [] + for f in self.retraction_findings: + res = getattr(f, "result", None) + retr_payload.append({ + "entry_key": getattr(f, "entry_key", ""), + "doi": getattr(f, "doi", ""), + "is_retracted": getattr(res, "is_retracted", False) if res else False, + "update_type": getattr(res, "update_type", "") if res else "", + "notice_doi": getattr(res, "notice_doi", "") if res else "", + "notice_label": getattr(res, "notice_label", "") if res else "", + "notice_url": getattr(res, "notice_url", "") if res else "", + }) + + url_payload = [] + for f in self.url_findings: + url_payload.append({ + "entry_key": getattr(f, "entry_key", ""), + "url": getattr(f, "url", ""), + "status": getattr(f, "status", ""), + "status_code": getattr(f, "status_code", None), + "detail": getattr(f, "detail", ""), + }) + + duplicates = [] + for grp in (self.duplicate_groups or []): + keys = [getattr(e, "key", "") for e in getattr(grp, "entries", [])] + duplicates.append([k for k in keys if k]) + + bib_stats, latex_stats = self.get_summary_stats() + return { + "meta": { + "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + # Counts only β€” never expose source filenames in any + # downstream artifact (HTML, JSON, anywhere else). 
+ "bib_files_count": len(self.bib_files), + "tex_files_count": len(self.tex_files), + "template": getattr(self.template, "name", "") if self.template else "", + }, + "summary": {"bibliography": bib_stats, "latex": latex_stats}, + "entries": entries_payload, + "submission_results": sub_payload, + "retractions": retr_payload, + "url_findings": url_payload, + "duplicates": duplicates, + "missing_citations": list(self.missing_citations), + } + + def save_json(self, filepath: str) -> None: + """Write a machine-readable JSON dump of the full report.""" + payload = self.build_payload() + with open(filepath, "w", encoding="utf-8") as f: + json.dump(payload, f, ensure_ascii=False, indent=2, default=_json_default) + + def save_html(self, filepath: str) -> None: + """Write a single self-contained HTML report (CSS+JS inlined).""" + payload = self.build_payload() + html = render_standalone_html(payload) + with open(filepath, "w", encoding="utf-8") as f: + f.write(html) + def save_latex_quality_report(self, filepath: str, submission_results: List[CheckResult], template=None): """Generate and save LaTeX quality report (all tex-related quality checks).""" lines = [] @@ -742,8 +876,7 @@ class ReportGenerator: lines.append("") lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") lines.append("") - tex_names = ", ".join([f"`{Path(f).name}`" for f in self.tex_files]) if self.tex_files else "N/A" - lines.append(f"**TeX File(s):** {tex_names}") + lines.append(f"**Inputs:** {len(self.tex_files)} TeX file(s)") lines.append("") if template: diff --git a/src/report/html_report.py b/src/report/html_report.py new file mode 100644 index 0000000000000000000000000000000000000000..1c168f973eccfeb38228d26ebb79c9423a8c030d --- /dev/null +++ b/src/report/html_report.py @@ -0,0 +1,515 @@ +""" +Self-contained, single-file HTML report. 
+ +The output is a single .html with all CSS and JS inlined: no external network +requests, opens cleanly with `open report.html` on any OS, supports light/dark +theme via prefers-color-scheme + manual toggle, and offers per-section +filtering (Verified/Unverified/Unused for bib; Errors/Warnings/Info for +LaTeX), full-text search, and inline highlighting of the offending substring +on each LaTeX-quality issue. + +The page is driven from a JSON blob embedded into the HTML, so re-rendering or +re-filtering is cheap. We deliberately avoid external libraries. + +The embedded JSON deliberately omits all source file paths β€” only counts +(`bib_files_count`, `tex_files_count`) reach the page so reports can be +shared without leaking local paths. +""" +from __future__ import annotations + +import html +import json +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + + +# --------------------------------------------------------------------------- +# Public entrypoint +# --------------------------------------------------------------------------- + +def render_standalone_html(payload: Dict[str, Any]) -> str: + """ + Render a complete self-contained HTML report. + + `payload` shape (see ReportGenerator.build_payload): + { + "meta": { "generated_at": str, "bib_files_count": int, + "tex_files_count": int, "template": str }, + "summary": { ... bib + latex counts ... }, + "entries": [ { ... per-bib-entry ... } ], + "submission_results": [ { ... per-line LaTeX issues ... } ], + "retractions": [ { entry_key, doi, type, notice_url, label } ], + "url_findings": [ { entry_key, url, status, status_code, detail } ], + "duplicates": [ [keys...], ... ], + "missing_citations": [ "key1", "key2" ] + } + """ + blob = json.dumps(payload, ensure_ascii=False).replace(" + + + + +__TITLE__ + + + + +
+

🛡️ BibGuard Report

+

Loading…

+
+ +
+ +
+ + +
+ +
+ + + +
+ +
+
+
+ Show: + All 0 + βœ“ Verified 0 + ⚠ Unverified 0 + πŸ—‘ Unused 0 +
+
+
+ +
+
+
+ Severity: + All 0 + πŸ”΄ Errors 0 + 🟑 Warnings 0 + πŸ”΅ Info 0 +
+
+
+ +
+

🚫 Retractions

+
+

🔗 URL Liveness

+
+
+ +
BibGuard — single-file HTML report. Works offline.
+
+ + + + + +""" diff --git a/src/report/line_report.py b/src/report/line_report.py deleted file mode 100644 index c9bb3a84437284c284d6a21fa5ac2353c79705a1..0000000000000000000000000000000000000000 --- a/src/report/line_report.py +++ /dev/null @@ -1,254 +0,0 @@ -""" -Line-by-line report generator. - -Generates a report that follows the TeX file structure, -showing issues in order of appearance in the document. -""" -import re -from typing import List, Dict, Tuple, Optional -from dataclasses import dataclass, field -from collections import defaultdict -from datetime import datetime -from pathlib import Path - -from ..checkers.base import CheckResult, CheckSeverity - - -@dataclass -class LineIssue: - """An issue associated with a specific line or range.""" - start_line: int - end_line: int - line_content: str - issues: List[CheckResult] = field(default_factory=list) - block_type: Optional[str] = None # 'figure', 'table', 'equation', etc. - - -class LineByLineReportGenerator: - """ - Generates a report organized by TeX file line order. - - Groups consecutive lines and special environments into blocks, - then outputs issues in the order they appear in the document. 
- """ - - # LaTeX environments that should be grouped as blocks - BLOCK_ENVIRONMENTS = [ - 'figure', 'figure*', 'table', 'table*', 'tabular', 'tabular*', - 'equation', 'equation*', 'align', 'align*', 'gather', 'gather*', - 'algorithm', 'algorithm2e', 'algorithmic', 'lstlisting', - 'verbatim', 'minted', 'tikzpicture', 'minipage', 'subfigure', - ] - - def __init__(self, tex_content: str, tex_path: str): - self.tex_content = tex_content - self.tex_path = tex_path - self.lines = tex_content.split('\n') - self.line_issues: Dict[int, List[CheckResult]] = defaultdict(list) - self.blocks: List[Tuple[int, int, str]] = [] # (start, end, env_type) - - # Parse block environments - self._parse_blocks() - - def _parse_blocks(self): - """Find all block environments in the TeX content.""" - for env in self.BLOCK_ENVIRONMENTS: - env_escaped = env.replace('*', r'\*') - pattern = re.compile( - rf'\\begin\{{{env_escaped}\}}.*?\\end\{{{env_escaped}\}}', - re.DOTALL - ) - - for match in pattern.finditer(self.tex_content): - start_line = self._pos_to_line(match.start()) - end_line = self._pos_to_line(match.end()) - self.blocks.append((start_line, end_line, env)) - - # Sort blocks by start line - self.blocks.sort(key=lambda x: x[0]) - - def _pos_to_line(self, pos: int) -> int: - """Convert character position to line number (1-indexed).""" - return self.tex_content[:pos].count('\n') + 1 - - def add_results(self, results: List[CheckResult]): - """Add check results to the line-by-line mapping.""" - for result in results: - if result.passed: - continue - - line_num = result.line_number or 0 - if line_num > 0: - self.line_issues[line_num].append(result) - - def _get_block_for_line(self, line_num: int) -> Optional[Tuple[int, int, str]]: - """Check if a line is part of a block environment.""" - for start, end, env_type in self.blocks: - if start <= line_num <= end: - return (start, end, env_type) - return None - - def _get_block_content(self, start: int, end: int) -> str: - """Get content for a 
block of lines.""" - block_lines = self.lines[start-1:end] - if len(block_lines) > 10: - # Truncate long blocks - return '\n'.join(block_lines[:5]) + '\n [...]\n' + '\n'.join(block_lines[-3:]) - return '\n'.join(block_lines) - - def _severity_icon(self, severity: CheckSeverity) -> str: - """Get icon for severity level.""" - icons = { - CheckSeverity.ERROR: 'πŸ”΄', - CheckSeverity.WARNING: '🟑', - CheckSeverity.INFO: 'πŸ”΅', - } - return icons.get(severity, 'βšͺ') - - def generate(self) -> str: - """Generate the line-by-line report.""" - lines = [] - - # Header - lines.append("# BibGuard Line-by-Line Report") - lines.append("") - lines.append(f"**File:** `{Path(self.tex_path).name}`") - lines.append(f"**Generated at:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - lines.append("") - lines.append("---") - lines.append("") - - # Summary counts - error_count = sum(1 for issues in self.line_issues.values() - for r in issues if r.severity == CheckSeverity.ERROR) - warning_count = sum(1 for issues in self.line_issues.values() - for r in issues if r.severity == CheckSeverity.WARNING) - info_count = sum(1 for issues in self.line_issues.values() - for r in issues if r.severity == CheckSeverity.INFO) - - lines.append("## πŸ“Š Overview") - lines.append("") - lines.append(f"| πŸ”΄ Errors | 🟑 Warnings | πŸ”΅ Suggestions |") - lines.append(f"|:---------:|:-----------:|:--------------:|") - lines.append(f"| {error_count} | {warning_count} | {info_count} |") - lines.append("") - lines.append("---") - lines.append("") - - if not self.line_issues: - lines.append("πŸŽ‰ **No issues found!**") - return '\n'.join(lines) - - # Process lines in order - lines.append("## πŸ“ Line-by-Line Details") - lines.append("") - - processed_lines = set() - sorted_line_nums = sorted(self.line_issues.keys()) - - for line_num in sorted_line_nums: - if line_num in processed_lines: - continue - - issues = self.line_issues[line_num] - if not issues: - continue - - # Check if this line is part of a block 
- block = self._get_block_for_line(line_num) - - if block: - start, end, env_type = block - - # Mark all lines in block as processed - for ln in range(start, end + 1): - processed_lines.add(ln) - - # Collect all issues in this block - block_issues = [] - for ln in range(start, end + 1): - if ln in self.line_issues: - block_issues.extend(self.line_issues[ln]) - - if block_issues: - lines.append(f"### πŸ“¦ `{env_type}` Environment (Lines {start}-{end})") - lines.append("") - lines.append("```latex") - lines.append(self._get_block_content(start, end)) - lines.append("```") - lines.append("") - - # Group issues by type - for issue in block_issues: - icon = self._severity_icon(issue.severity) - lines.append(f"- {icon} **{issue.message}**") - if issue.suggestion: - lines.append(f" - πŸ’‘ {issue.suggestion}") - - lines.append("") - else: - # Single line - processed_lines.add(line_num) - - # Use custom line_content from CheckResult if available, otherwise get from file - custom_content = None - for issue in issues: - if issue.line_content: - custom_content = issue.line_content - break - - line_content = custom_content if custom_content else ( - self.lines[line_num - 1] if line_num <= len(self.lines) else "" - ) - - lines.append(f"### Line {line_num}") - lines.append("") - lines.append("```latex") - lines.append(line_content) - lines.append("```") - lines.append("") - - for issue in issues: - icon = self._severity_icon(issue.severity) - lines.append(f"- {icon} **{issue.message}**") - if issue.suggestion: - lines.append(f" - πŸ’‘ {issue.suggestion}") - - lines.append("") - - # Footer - lines.append("---") - lines.append("") - lines.append("*Report generated by BibGuard*") - - return '\n'.join(lines) - - def save(self, filepath: str): - """Save report to file.""" - content = self.generate() - with open(filepath, 'w', encoding='utf-8') as f: - f.write(content) - - -def generate_line_report( - tex_content: str, - tex_path: str, - results: List[CheckResult], - output_path: str 
-) -> str: - """ - Generate a line-by-line report from check results. - - Args: - tex_content: The TeX file content - tex_path: Path to the TeX file - results: List of check results from all checkers - output_path: Where to save the report - - Returns: - Path to the generated report - """ - generator = LineByLineReportGenerator(tex_content, tex_path) - generator.add_results(results) - generator.save(output_path) - return output_path diff --git a/src/templates/__pycache__/__init__.cpython-313.pyc b/src/templates/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index 4bea3b9a05696703f73715f8a425c228cecc7cb1..0000000000000000000000000000000000000000 Binary files a/src/templates/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/src/templates/__pycache__/base_template.cpython-313.pyc b/src/templates/__pycache__/base_template.cpython-313.pyc deleted file mode 100644 index c144a736268828d8c269b73827a944ae4bb49105..0000000000000000000000000000000000000000 Binary files a/src/templates/__pycache__/base_template.cpython-313.pyc and /dev/null differ diff --git a/src/templates/base_template.py b/src/templates/base_template.py index 4e186628afdb647ac9c437f0f33b5a7b7abc2f34..2688109928c52eec58086d98cafbc0407a7a8a97 100644 --- a/src/templates/base_template.py +++ b/src/templates/base_template.py @@ -1,8 +1,16 @@ """ Conference template definitions. -Each template contains conference-specific formatting requirements -and rules for paper submission quality checking. +Each template captures conference-specific submission requirements that we can +mechanically verify from the LaTeX source. Things that genuinely require a +compiled PDF (page count, font embedding, image bleed) are documented in +``extra_rules`` so the report still surfaces them as reminders to the author. 
+ +The dataclass keeps every legacy field present (``name``, ``short_name``, +``page_limit_review/camera``, ``double_blind``, ``mandatory_sections``, +``optional_sections``, ``style_package``, ``checkers``, ``extra_rules``) so +older callers in ``src/ui/template_selector.py`` and the report generator +keep working unchanged. """ from dataclasses import dataclass, field from typing import List, Dict, Optional @@ -18,40 +26,56 @@ class ConferenceField(Enum): @dataclass class ConferenceTemplate: - """ - Template containing conference-specific submission requirements. - - Attributes: - name: Full conference name (e.g., "ACL 2025") - short_name: Short identifier (e.g., "acl") - field: Research field category - page_limit_review: Page limit for review submission (main content only) - page_limit_camera: Page limit for camera-ready (main content only) - double_blind: Whether the conference uses double-blind review - caption_table_above: Whether table captions should be above - caption_figure_below: Whether figure captions should be below - mandatory_sections: List of required sections (e.g., ["Limitations"]) - optional_sections: List of encouraged sections - style_package: Name of the LaTeX style package - checkers: List of checker names to run for this template - extra_rules: Additional conference-specific rules - """ + """Conference-specific submission requirements with verifiable per-venue rules.""" + + # === Identity === name: str short_name: str field: ConferenceField + + # === Page budget === page_limit_review: int page_limit_camera: int + references_excluded: bool = True # references don't count toward limit + appendix_excluded: bool = True + limitations_excluded: bool = False # ACL family puts Limitations outside the budget + ethics_excluded: bool = False # ACL family / ICLR exclude ethics + min_main_pages: int = 0 # ICLR strictly requires >=6 + + # === Anonymity (double-blind) === double_blind: bool = True + forbid_identifying_urls: bool = False # strict 
CVPR/ICCV/ECCV anonymization + forbid_acks_in_review: bool = False # acknowledgments must be omitted in review + arxiv_allowed: bool = True # most modern venues permit arXiv preprints + + # === Captions === caption_table_above: bool = True caption_figure_below: bool = True + + # === Required content === mandatory_sections: List[str] = field(default_factory=list) + mandatory_camera_sections: List[str] = field(default_factory=list) optional_sections: List[str] = field(default_factory=list) - style_package: str = "" + + # === Template / typesetting === + style_package: str = "" # \usepackage{} expected in preamble + doc_class: str = "" # \documentclass{} expected (e.g. 'llncs') + paper_size: str = "" # 'letter' | 'a4' | '' (skip) + single_column: bool = False + font_size_pt: int = 0 # 0 = skip + + # === Per-venue special deliverables === + requires_paper_checklist: bool = False # NeurIPS desk-rejects without it + requires_reproducibility_statement: bool = False # ICLR / NeurIPS encourage + requires_lay_summary_camera: bool = False # ICML camera-ready + requires_type1_fonts: bool = False # ICML + + # === Backwards-compat fields === checkers: List[str] = field(default_factory=lambda: [ 'caption', 'reference', 'ai_artifacts', 'formatting', 'anonymization' ]) extra_rules: Dict[str, str] = field(default_factory=dict) - + def to_dict(self) -> dict: return { 'name': self.name, @@ -61,13 +85,14 @@ class ConferenceTemplate: 'page_limit_camera': self.page_limit_camera, 'double_blind': self.double_blind, 'mandatory_sections': self.mandatory_sections, + 'mandatory_camera_sections': self.mandatory_camera_sections, 'optional_sections': self.optional_sections, 'checkers': self.checkers, } # ============================================================================ -# NLP Conferences (ACL, EMNLP, NAACL) +# NLP Conferences (ACL, EMNLP, NAACL) β€” share the *ACL style files # ============================================================================ ACL_TEMPLATE = 
ConferenceTemplate( @@ -76,31 +101,43 @@ ACL_TEMPLATE = ConferenceTemplate( field=ConferenceField.NLP, page_limit_review=8, page_limit_camera=9, + references_excluded=True, + limitations_excluded=True, + ethics_excluded=True, double_blind=True, + arxiv_allowed=True, mandatory_sections=["Limitations"], - optional_sections=["Ethical Considerations"], - style_package="acl2025", + optional_sections=["Ethical Considerations", "Ethics Statement"], + style_package="acl", + paper_size="a4", extra_rules={ - "format": "Two-column, A4 paper", - "references": "Unlimited pages for references", - "appendix": "Allowed after references, two-column format", - } + "format": "Two-column, A4 paper, 11pt", + "limitations_content": "Discussion only β€” no new methods/figures/results inside Limitations", + "appendix": "Allowed after references", + "responsible_nlp_checklist": "ARR Responsible NLP Research checklist (separate, not inline)", + }, ) EMNLP_TEMPLATE = ConferenceTemplate( name="EMNLP 2024", short_name="emnlp", field=ConferenceField.NLP, - page_limit_review=8, + page_limit_review=8, # long paper review page_limit_camera=9, + references_excluded=True, + limitations_excluded=True, + ethics_excluded=True, double_blind=True, + arxiv_allowed=True, mandatory_sections=["Limitations"], - optional_sections=["Ethics Statement"], - style_package="emnlp2024", + optional_sections=["Ethical Considerations", "Ethics Statement"], + style_package="acl", # *ACL share the same acl.sty + paper_size="a4", extra_rules={ - "format": "Two-column, single-spaced", - "short_paper": "4 pages for short papers (5 camera-ready)", - } + "short_paper": "4 pages for short papers (5 camera-ready), excluding refs/limitations/ethics", + "submission_route": "Submitted via ACL Rolling Review (ARR)", + "limitations_content": "Discussion only β€” no new methods/figures/results inside Limitations", + }, ) NAACL_TEMPLATE = ConferenceTemplate( @@ -109,18 +146,23 @@ NAACL_TEMPLATE = ConferenceTemplate( 
field=ConferenceField.NLP, page_limit_review=8, page_limit_camera=9, + references_excluded=True, + limitations_excluded=True, + ethics_excluded=True, double_blind=True, + arxiv_allowed=True, mandatory_sections=["Limitations"], - optional_sections=["Ethics Statement"], - style_package="naacl2025", + optional_sections=["Ethical Considerations", "Ethics Statement"], + style_package="acl", + paper_size="a4", extra_rules={ "review_system": "ACL Rolling Review (ARR)", "format": "Two-column, A4 paper", - } + }, ) # ============================================================================ -# Computer Vision Conferences (CVPR, ICCV, ECCV) +# Computer Vision Conferences (CVPR, ICCV, ECCV) β€” strict double-blind # ============================================================================ CVPR_TEMPLATE = ConferenceTemplate( @@ -128,16 +170,19 @@ CVPR_TEMPLATE = ConferenceTemplate( short_name="cvpr", field=ConferenceField.CV, page_limit_review=8, - page_limit_camera=8, # No extra page for camera-ready + page_limit_camera=8, # No extra page for camera-ready + references_excluded=True, double_blind=True, - mandatory_sections=[], - optional_sections=[], + forbid_identifying_urls=True, + forbid_acks_in_review=True, + arxiv_allowed=True, style_package="cvpr", + paper_size="letter", extra_rules={ - "strict_anonymity": "No links to websites that reveal identity", - "supplementary": "Separate PDF allowed, no page limit", - "references": "No limit on references", - } + "supplementary": "Separate PDF allowed; reviewers not obligated to view", + "rebuttal": "1 page max; no external links; no new contributions", + "anonymous_code": "Use Anonymous GitHub (https://anonymous.4open.science) for code links", + }, ) ICCV_TEMPLATE = ConferenceTemplate( @@ -146,14 +191,18 @@ ICCV_TEMPLATE = ConferenceTemplate( field=ConferenceField.CV, page_limit_review=8, page_limit_camera=8, + references_excluded=True, double_blind=True, - mandatory_sections=[], - optional_sections=[], + 
forbid_identifying_urls=True, + forbid_acks_in_review=True, + arxiv_allowed=True, style_package="iccv", + paper_size="letter", extra_rules={ "format": "Two-column, 10pt Times font", - "supplementary": "Optional PDF for extra material", - } + "supplementary": "Optional PDF; same deadline as main paper", + "anonymous_code": "Use Anonymous GitHub for code links during review", + }, ) ECCV_TEMPLATE = ConferenceTemplate( @@ -162,15 +211,19 @@ ECCV_TEMPLATE = ConferenceTemplate( field=ConferenceField.CV, page_limit_review=14, page_limit_camera=14, + references_excluded=True, double_blind=True, - mandatory_sections=[], - optional_sections=[], - style_package="eccv", + forbid_identifying_urls=True, + forbid_acks_in_review=True, + arxiv_allowed=True, + style_package="", # uses LNCS style file, not a usepackage + doc_class="llncs", + paper_size="a4", extra_rules={ - "format": "Springer LNCS format", - "template": "Do not use TIMES font, use default template font", - "headings": "Capitalize except articles/prepositions/conjunctions", - } + "format": "Springer LNCS format β€” use llncs.cls", + "headings": "Capitalize first letter of headings except articles/prepositions/conjunctions", + "fonts": "Use the LNCS default font; do NOT switch to Times", + }, ) # ============================================================================ @@ -183,15 +236,22 @@ NEURIPS_TEMPLATE = ConferenceTemplate( field=ConferenceField.ML, page_limit_review=9, page_limit_camera=10, + references_excluded=True, + appendix_excluded=True, double_blind=True, - mandatory_sections=["Paper Checklist"], - optional_sections=["Broader Impact"], + arxiv_allowed=True, + requires_paper_checklist=True, + requires_reproducibility_statement=True, + optional_sections=["Broader Impact", "Broader Impacts"], style_package="neurips_2025", + paper_size="letter", + single_column=True, extra_rules={ - "checklist": "NeurIPS paper checklist is MANDATORY, desk reject without it", - "appendix": "Technical appendix after 
checklist, no page limit", - "format": "Single PDF including main content, references, and checklist", - } + "checklist": "MANDATORY β€” papers without the NeurIPS Paper Checklist are desk rejected", + "checklist_position": "After references and supplementary material; outside page limit", + "appendix": "Technical appendix follows checklist; no page limit", + "single_pdf": "Single PDF: main content + references + checklist + appendix", + }, ) ICML_TEMPLATE = ConferenceTemplate( @@ -200,15 +260,23 @@ ICML_TEMPLATE = ConferenceTemplate( field=ConferenceField.ML, page_limit_review=8, page_limit_camera=9, + references_excluded=True, + appendix_excluded=True, double_blind=True, - mandatory_sections=["Impact Statement"], # Required for camera-ready - optional_sections=["Acknowledgments"], + arxiv_allowed=True, + mandatory_camera_sections=["Impact Statement"], + requires_lay_summary_camera=True, + requires_type1_fonts=True, style_package="icml2025", + paper_size="letter", + single_column=True, + font_size_pt=10, extra_rules={ - "font": "10 point Times, embedded Type-1 fonts only", - "lay_summary": "Plain language summary required for accepted papers", - "format": "Use pdflatex for best results", - } + "fonts": "Type-1 fonts only; embed all fonts in the PDF", + "lay_summary": "Plain-language summary required at camera-ready submission (OpenReview)", + "impact_statement": "Required (broader impact + ethics) at camera-ready, before References", + "compile_with": "Use pdflatex for best results", + }, ) ICLR_TEMPLATE = ConferenceTemplate( @@ -217,15 +285,22 @@ ICLR_TEMPLATE = ConferenceTemplate( field=ConferenceField.ML, page_limit_review=10, page_limit_camera=10, + references_excluded=True, + ethics_excluded=True, + min_main_pages=6, double_blind=True, - mandatory_sections=[], + arxiv_allowed=True, + requires_reproducibility_statement=True, optional_sections=["Ethics Statement", "Reproducibility Statement"], style_package="iclr2025_conference", + paper_size="letter", + 
single_column=True, extra_rules={ "format": "10pt Times New Roman, 11pt vertical spacing", - "submission": "Via OpenReview", - "min_pages": "Main text must be between 6 and 10 pages", - } + "submission": "OpenReview only", + "page_limit": "Strictly 6–10 pages of main text; 11th main-text page = desk reject", + "reproducibility_statement": "Encouraged at end of main text, before references; <=1 page; doesn't count toward limit", + }, ) # ============================================================================ diff --git a/src/ui/__pycache__/__init__.cpython-313.pyc b/src/ui/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index 9bd2ea8343637c994e457e05040eda25af3cb0fe..0000000000000000000000000000000000000000 Binary files a/src/ui/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/src/ui/__pycache__/template_selector.cpython-313.pyc b/src/ui/__pycache__/template_selector.cpython-313.pyc deleted file mode 100644 index ae6532abad95d6e03be3fb6976bf1110fe17326d..0000000000000000000000000000000000000000 Binary files a/src/ui/__pycache__/template_selector.cpython-313.pyc and /dev/null differ diff --git a/src/ui/__pycache__/workflow_editor.cpython-313.pyc b/src/ui/__pycache__/workflow_editor.cpython-313.pyc deleted file mode 100644 index 81b4ec5be6d7d659ee63902d7ebef17fc82c2fd6..0000000000000000000000000000000000000000 Binary files a/src/ui/__pycache__/workflow_editor.cpython-313.pyc and /dev/null differ diff --git a/src/utils/__pycache__/__init__.cpython-313.pyc b/src/utils/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index 9fa3d7bac9268b90e0eb75a748c60a6968a18742..0000000000000000000000000000000000000000 Binary files a/src/utils/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/src/utils/__pycache__/cache.cpython-313.pyc b/src/utils/__pycache__/cache.cpython-313.pyc deleted file mode 100644 index 3f38d16fda753fc16d22b7ff926ce8ba1bdd40cb..0000000000000000000000000000000000000000 Binary files 
a/src/utils/__pycache__/cache.cpython-313.pyc and /dev/null differ diff --git a/src/utils/__pycache__/logger.cpython-313.pyc b/src/utils/__pycache__/logger.cpython-313.pyc deleted file mode 100644 index eb8aab83bbd5eb9dd620dcef4ecbde88bb73e480..0000000000000000000000000000000000000000 Binary files a/src/utils/__pycache__/logger.cpython-313.pyc and /dev/null differ diff --git a/src/utils/__pycache__/normalizer.cpython-313.pyc b/src/utils/__pycache__/normalizer.cpython-313.pyc deleted file mode 100644 index dbddfbdb1c6abb30fffd5a6007d5c39cdcb1e4c4..0000000000000000000000000000000000000000 Binary files a/src/utils/__pycache__/normalizer.cpython-313.pyc and /dev/null differ diff --git a/src/utils/__pycache__/progress.cpython-313.pyc b/src/utils/__pycache__/progress.cpython-313.pyc deleted file mode 100644 index 149e039a4e27ed25266d6b6c9558e5d01c1d8b4d..0000000000000000000000000000000000000000 Binary files a/src/utils/__pycache__/progress.cpython-313.pyc and /dev/null differ diff --git a/src/utils/__pycache__/source_manager.cpython-313.pyc b/src/utils/__pycache__/source_manager.cpython-313.pyc deleted file mode 100644 index 6c47a85cfe68646c85c70cb7ba4f906a1652d9b2..0000000000000000000000000000000000000000 Binary files a/src/utils/__pycache__/source_manager.cpython-313.pyc and /dev/null differ diff --git a/src/utils/http.py b/src/utils/http.py new file mode 100644 index 0000000000000000000000000000000000000000..5542429f822c7ea13afd55f4f8904a506d577159 --- /dev/null +++ b/src/utils/http.py @@ -0,0 +1,179 @@ +""" +Shared HTTP client: pooled session + auto-retry + optional on-disk cache. + +All fetchers should go through `get_session()` instead of bare `requests.get`. +This gives them consistent retry/backoff on 429/5xx, polite-pool User-Agent, +and (when enabled) SQLite-backed response caching to skip re-querying the +same URL on re-runs. 
+""" +from __future__ import annotations + +import logging +import threading +from pathlib import Path +from typing import Optional + +import requests +from urllib3.util.retry import Retry +from requests.adapters import HTTPAdapter + +logger = logging.getLogger(__name__) + +# Global per-process state +_lock = threading.Lock() +_settings: dict = { + "contact_email": "", + "cache_enabled": True, + "cache_ttl_hours": 24, + "retry_total": 5, + "retry_backoff_factor": 1.5, + "cache_dir": None, # Path or None +} +_session: Optional[requests.Session] = None + + +def configure( + contact_email: str = "", + cache_enabled: bool = True, + cache_ttl_hours: int = 24, + retry_total: int = 5, + retry_backoff_factor: float = 1.5, + cache_dir: Optional[Path] = None, +) -> None: + """Configure HTTP layer. Call once at startup before any fetcher is used.""" + global _session + with _lock: + _settings.update({ + "contact_email": contact_email or "", + "cache_enabled": cache_enabled, + "cache_ttl_hours": int(cache_ttl_hours), + "retry_total": int(retry_total), + "retry_backoff_factor": float(retry_backoff_factor), + "cache_dir": cache_dir, + }) + # Force rebuild on next get_session() + _session = None + + +def user_agent() -> str: + """Build a polite User-Agent string. 
Includes contact email if configured.""" + email = _settings.get("contact_email") or "" + if email: + return f"BibGuard/1.0 (+https://github.com/thinkwee/BibGuard; mailto:{email})" + return "BibGuard/1.0 (+https://github.com/thinkwee/BibGuard)" + + +def _build_session() -> requests.Session: + """Construct a Session with retry and (optionally) caching.""" + cache_enabled = _settings["cache_enabled"] + ttl = _settings["cache_ttl_hours"] * 3600 + + if cache_enabled: + try: + from requests_cache import CachedSession # type: ignore + cache_dir = _settings.get("cache_dir") + if cache_dir is None: + cache_dir = Path.home() / ".cache" / "bibguard" + cache_dir.mkdir(parents=True, exist_ok=True) + session = CachedSession( + cache_name=str(cache_dir / "http_cache"), + backend="sqlite", + expire_after=ttl, + allowable_methods=("GET", "HEAD"), + allowable_codes=(200, 203, 300, 301, 308), + stale_if_error=True, + ) + logger.debug("HTTP cache enabled: %s (ttl=%ss)", cache_dir, ttl) + except ImportError: + logger.info( + "requests-cache not installed; running without HTTP cache. " + "Install via `pip install requests-cache` for big speedups on re-runs." + ) + session = requests.Session() + else: + session = requests.Session() + + # Important: 429 is NOT in status_forcelist. A 429 means "you're being + # rate-limited" β€” retrying just blocks the calling thread for tens of + # seconds while another parallel source could already have answered. + # We let the caller see the 429 immediately and move on; the circuit + # breaker (below) will skip the offending source for the rest of the run. 
+ retry = Retry( + total=_settings["retry_total"], + backoff_factor=_settings["retry_backoff_factor"], + status_forcelist=(500, 502, 503, 504), + allowed_methods=("GET", "HEAD"), + raise_on_status=False, + respect_retry_after_header=False, + ) + adapter = HTTPAdapter(max_retries=retry, pool_connections=20, pool_maxsize=20) + session.mount("https://", adapter) + session.mount("http://", adapter) + session.headers.update({"User-Agent": user_agent()}) + return session + + +def get_session() -> requests.Session: + """Return the shared, configured Session. Thread-safe.""" + global _session + if _session is None: + with _lock: + if _session is None: + _session = _build_session() + return _session + + +def reset_for_tests() -> None: + """Drop the shared session. Used by tests to force a rebuild.""" + global _session + with _lock: + _session = None + + +# --------------------------------------------------------------------------- +# Circuit breaker: trip a source after N consecutive failures so the rest of +# the run skips it instead of paying its rate-limit/timeout penalty per entry. +# --------------------------------------------------------------------------- +_breakers: dict[str, dict] = {} +_breakers_lock = threading.Lock() + + +def is_open(source: str) -> bool: + """True if the source's circuit is currently tripped (skip it).""" + with _breakers_lock: + b = _breakers.get(source) + return bool(b and b.get("open")) + + +def record_failure(source: str, threshold: int = 3) -> bool: + """Note a failure for `source`; trip the breaker after `threshold`. + + Returns True if the breaker is now (or was already) open. 
+ """ + with _breakers_lock: + b = _breakers.setdefault(source, {"failures": 0, "open": False}) + b["failures"] += 1 + if b["failures"] >= threshold: + if not b["open"]: + logger.warning( + "Circuit breaker tripped for %s after %d failures; " + "skipping for the rest of this run.", + source, b["failures"], + ) + b["open"] = True + return b["open"] + + +def record_success(source: str) -> None: + """Reset the failure counter on a success.""" + with _breakers_lock: + b = _breakers.get(source) + if b: + b["failures"] = 0 + b["open"] = False + + +def reset_breakers() -> None: + """Clear all breaker state (called at the start of a fresh run).""" + with _breakers_lock: + _breakers.clear() diff --git a/src/utils/logging_setup.py b/src/utils/logging_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..742987bde7ba801ccd30e4d7f61a9b4abea16e92 --- /dev/null +++ b/src/utils/logging_setup.py @@ -0,0 +1,183 @@ +""" +Logging bootstrap and per-run capture utilities. + +Design goals +------------ +1. **One env var to rule them all.** ``BIBGUARD_LOG=DEBUG`` (or + ``BIBGUARD_DEBUG=1``) turns on full tracebacks across the codebase. Default + is WARNING so stdout stays quiet during normal runs. + +2. **Always-on file log.** Even at WARNING console level we still write a + rotating DEBUG log to ``~/.cache/bibguard/logs/bibguard.log`` (override with + ``BIBGUARD_LOG_FILE``). That way, when something blows up mid-run you can + ``tail`` or grep the file after the fact β€” no need to rerun with --verbose. + +3. **Pinpoint location.** Formatter includes ``filename:lineno`` so any log + line tells you exactly which source line emitted it. + +4. **Per-run capture for the UI.** ``capture_run()`` is a context manager that + returns a buffer + path. The Gradio app attaches it at the start of each + check, then ships the resulting log as a downloadable artifact alongside + the HTML report. 
+""" +from __future__ import annotations + +import logging +import logging.handlers +import os +import sys +import tempfile +from contextlib import contextmanager +from io import StringIO +from pathlib import Path +from typing import Iterator, Optional + +# Format used for both console and file. ``%(filename)s:%(lineno)d`` is the +# important addition β€” it makes any traceback-free warning still navigable. +_FMT = "%(asctime)s %(levelname)-7s %(name)s %(filename)s:%(lineno)d β€” %(message)s" +_DATEFMT = "%H:%M:%S" + + +def _resolve(level: str | int) -> int: + if isinstance(level, int): + return level + return getattr(logging, str(level).upper(), logging.WARNING) + + +def _default_log_path() -> Path: + override = os.environ.get("BIBGUARD_LOG_FILE", "").strip() + if override: + return Path(override).expanduser() + return Path.home() / ".cache" / "bibguard" / "logs" / "bibguard.log" + + +def setup(level: str | int | None = None, *, quiet: bool = False, + log_file: Optional[Path | str] = None) -> Path: + """ + Configure root logger. + + Console level is controlled by ``level`` / ``BIBGUARD_LOG`` / ``quiet``. + Regardless of console level, a DEBUG-level rotating file is *always* + written so failures are reproducible after the fact. + + Returns the path to the active log file (useful for surfacing in the UI). + """ + # Resolve console level + if quiet: + console_level = logging.ERROR + elif os.environ.get("BIBGUARD_DEBUG", "").strip() in ("1", "true", "yes"): + console_level = logging.DEBUG + elif level is not None: + console_level = _resolve(level) + else: + console_level = _resolve(os.environ.get("BIBGUARD_LOG", "WARNING")) + + root = logging.getLogger() + root.setLevel(logging.DEBUG) # let handlers filter; root keeps everything + + # ------------------------------------------------------------- console + # If we already attached a console handler, reuse it (avoids duplicates + # when modules import this multiple times). 
+ console_handler = None + for h in root.handlers: + if getattr(h, "_bibguard_console", False): + console_handler = h + break + if console_handler is None: + console_handler = logging.StreamHandler(sys.stderr) + console_handler._bibguard_console = True # type: ignore[attr-defined] + console_handler.setFormatter(logging.Formatter(fmt=_FMT, datefmt=_DATEFMT)) + root.addHandler(console_handler) + console_handler.setLevel(console_level) + + # ------------------------------------------------------------- file + log_path = Path(log_file).expanduser() if log_file else _default_log_path() + file_handler: Optional[logging.handlers.RotatingFileHandler] = None + for h in root.handlers: + if getattr(h, "_bibguard_file", False): + file_handler = h # type: ignore[assignment] + break + try: + if file_handler is None: + log_path.parent.mkdir(parents=True, exist_ok=True) + file_handler = logging.handlers.RotatingFileHandler( + str(log_path), maxBytes=2_000_000, backupCount=3, encoding="utf-8", + ) + file_handler._bibguard_file = True # type: ignore[attr-defined] + file_handler.setFormatter(logging.Formatter(fmt=_FMT, datefmt=_DATEFMT)) + file_handler.setLevel(logging.DEBUG) + root.addHandler(file_handler) + except OSError as e: + # Non-fatal: filesystem unavailable, fall back to stderr-only. + root.warning("File logging disabled (%s); stderr only.", e) + + # Quiet down noisy third-party loggers unless we're in DEBUG console mode. + if console_level > logging.DEBUG: + for noisy in ("urllib3", "requests", "requests_cache", "bibtexparser"): + logging.getLogger(noisy).setLevel(logging.WARNING) + else: + for noisy in ("urllib3", "requests", "requests_cache", "bibtexparser"): + logging.getLogger(noisy).setLevel(logging.INFO) + + return log_path + + +@contextmanager +def capture_run(target_path: Optional[Path] = None) -> Iterator[tuple[Path, "_RunStats"]]: + """ + Attach a temporary DEBUG-level file handler for the duration of a single run. 
+ + Yields ``(path, stats)`` where: + * ``path`` is the per-run log file written into the report's output dir + (or a temp file if ``target_path`` is None). + * ``stats`` exposes ``warnings`` / ``errors`` counters so the UI can + surface "N warnings logged" without reading the file. + + Used by ``app.py`` so each Gradio run produces a self-contained + ``bibguard.log`` next to ``report.html`` that the user can download. + """ + path = target_path or Path(tempfile.NamedTemporaryFile( + suffix=".log", prefix="bibguard_run_", delete=False + ).name) + path.parent.mkdir(parents=True, exist_ok=True) + + handler = logging.FileHandler(str(path), mode="w", encoding="utf-8") + handler.setFormatter(logging.Formatter(fmt=_FMT, datefmt=_DATEFMT)) + handler.setLevel(logging.DEBUG) + + stats = _RunStats() + handler.addFilter(stats) # filters can also count + + root = logging.getLogger() + root.addHandler(handler) + try: + yield path, stats + finally: + try: + handler.flush() + handler.close() + except Exception: + pass + try: + root.removeHandler(handler) + except ValueError: + pass + + +class _RunStats(logging.Filter): + """Logging filter that just counts warning+ records (always returns True).""" + + def __init__(self) -> None: + super().__init__() + self.warnings = 0 + self.errors = 0 + self.exceptions = 0 + + def filter(self, record: logging.LogRecord) -> bool: # type: ignore[override] + if record.levelno >= logging.ERROR: + self.errors += 1 + if record.exc_info: + self.exceptions += 1 + elif record.levelno >= logging.WARNING: + self.warnings += 1 + return True diff --git a/src/utils/progress.py b/src/utils/progress.py index d8a6b66098113b6eccb4a9d014b47894bc157945..9199e683e44b3141183d5166a86e41398e95d8f0 100644 --- a/src/utils/progress.py +++ b/src/utils/progress.py @@ -188,9 +188,10 @@ class ProgressDisplay: guide_table.add_column("File Name", style="cyan") guide_table.add_column("Description", style="dim") + guide_table.add_row("report.html", "Self-contained interactive 
HTML β€” opens offline, dark-mode aware") guide_table.add_row("bibliography_report.md", "Detailed metadata and usage issues for each bib entry") - guide_table.add_row("latex_quality_report.md", "Summary of all LaTeX writing and formatting issues") - guide_table.add_row("line_by_line_report.md", "All LaTeX issues sorted by line number for easy fixing") + guide_table.add_row("latex_quality_report.md", "All LaTeX writing/formatting issues, grouped by severity") + guide_table.add_row("report.json", "Machine-readable dump (only when 'json' is in output.formats)") guide_table.add_row("*_only_used.bib", "A cleaned version of your bib file containing only cited entries") self.console.print(Panel( diff --git a/src/utils/validation.py b/src/utils/validation.py new file mode 100644 index 0000000000000000000000000000000000000000..6f352b8ad6316754e7b34911b8d34cb69e2eff42 --- /dev/null +++ b/src/utils/validation.py @@ -0,0 +1,121 @@ +""" +Pre-flight validation for user-supplied .bib / .tex inputs. + +Catch obvious problems (giant files, files that don't actually contain bibs/cites) +*before* spending five minutes on metadata fetches. +""" +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import List + +logger = logging.getLogger(__name__) + + +@dataclass +class ValidationReport: + ok: bool = True + errors: List[str] = field(default_factory=list) + warnings: List[str] = field(default_factory=list) + + def add_error(self, msg: str) -> None: + self.errors.append(msg) + self.ok = False + + def add_warning(self, msg: str) -> None: + self.warnings.append(msg) + + +# Sensible thresholds; tuned for typical CS/ML papers. 
MAX_BIB_BYTES = 5 * 1024 * 1024  # 5 MB
MAX_BIB_ENTRIES = 5000
MAX_TEX_BYTES = 10 * 1024 * 1024  # 10 MB

# Start of a BibTeX `@type{` / `@type(` item; leading indentation is allowed.
_BIB_ENTRY = re.compile(r"^[ \t]*@(\w+)\s*[{(]", re.MULTILINE)
# `@` items that are BibTeX directives rather than bibliography entries.
_BIB_NON_ENTRIES = frozenset({"comment", "preamble", "string"})
# Any control word containing "cite": \cite, \citep, \citealp, \autocite, \Citet, ...
_TEX_HAS_CITES = re.compile(r"\\[A-Za-z]*[Cc]ite[A-Za-z]*")
_TEX_HAS_BIB = re.compile(r"\\(?:bibliography|addbibresource|printbibliography|bibitem)\b")


def _load_text(
    path: Path,
    rep: "ValidationReport",
    *,
    label: str,
    ext: str,
    max_bytes: int,
    large_hint: str,
) -> "str | None":
    """Run the checks shared by both validators and return the file's text.

    Records an error on *rep* and returns None when *path* is missing, is not
    a regular file, is empty, or cannot be read. Records a warning (and still
    returns the text) when the file is larger than *max_bytes*.
    """
    if not path.exists():
        rep.add_error(f"{label} file does not exist: {path}")
        return None
    if not path.is_file():
        rep.add_error(f"Not a file: {path}")
        return None

    try:
        size = path.stat().st_size
    except OSError as exc:
        # The file can vanish or become unreadable between the checks above
        # and this call; report it instead of crashing the pre-flight.
        rep.add_error(f"Cannot read {ext} file: {exc}")
        return None

    if size == 0:
        rep.add_error(f"{label} file is empty: {path}")
        return None
    if size > max_bytes:
        rep.add_warning(
            f".{ext} file is large ({size/1024/1024:.1f} MB). {large_hint}"
        )

    try:
        # utf-8-sig drops a leading BOM, which would otherwise hide an entry
        # that starts on the very first line from the `^`-anchored patterns.
        return path.read_text(encoding="utf-8-sig", errors="replace")
    except OSError as exc:
        rep.add_error(f"Cannot read {ext} file: {exc}")
        return None


def _count_bib_entries(text: str) -> int:
    """Count `@type{...}` items in *text*, ignoring @comment/@preamble/@string."""
    return sum(
        1
        for match in _BIB_ENTRY.finditer(text)
        if match.group(1).lower() not in _BIB_NON_ENTRIES
    )


def validate_bib(path: Path) -> "ValidationReport":
    """Pre-flight check on a .bib file.

    Errors: file missing, not a regular file, empty, unreadable, or containing
    no bibliography entries. Warnings: file larger than ``MAX_BIB_BYTES`` or
    holding more than ``MAX_BIB_ENTRIES`` entries.
    """
    rep = ValidationReport()
    text = _load_text(
        path,
        rep,
        label="Bib",
        ext="bib",
        max_bytes=MAX_BIB_BYTES,
        large_hint="Metadata checks may be slow.",
    )
    if text is None:
        return rep

    n = _count_bib_entries(text)
    if n == 0:
        rep.add_error(f"No bibtex entries (`@type{{...}}`) found in {path.name}.")
    elif n > MAX_BIB_ENTRIES:
        rep.add_warning(
            f"{n} entries in {path.name}; metadata checks may take a long time."
        )
    return rep


def validate_tex(path: Path) -> "ValidationReport":
    """Pre-flight check on a .tex file.

    Errors: file missing, not a regular file, empty, or unreadable.
    Warnings: file larger than ``MAX_TEX_BYTES``, or text containing neither a
    citation command nor a bibliography declaration.
    """
    rep = ValidationReport()
    text = _load_text(
        path,
        rep,
        label="TeX",
        ext="tex",
        max_bytes=MAX_TEX_BYTES,
        large_hint="Some checks scan whole content.",
    )
    if text is None:
        return rep

    if not (_TEX_HAS_CITES.search(text) or _TEX_HAS_BIB.search(text)):
        rep.add_warning(
            f"{path.name} contains no \\cite{{...}} and no \\bibliography{{...}} — "
            "BibGuard's bibliography checks won't find anything to verify."
        )
    return rep


def format_report(rep: "ValidationReport", label: str = "") -> str:
    """Pretty-print a ValidationReport for stdout.

    Returns one line per message, errors first and then warnings, each
    prefixed with ``[label] `` when *label* is non-empty. An empty report
    yields an empty string.
    """
    prefix = f"[{label}] " if label else ""
    lines = [f"{prefix}ERROR: {e}" for e in rep.errors]
    lines += [f"{prefix}WARN: {w}" for w in rep.warnings]
    return "\n".join(lines)