thinkwee commited on
Commit
fcffa22
·
1 Parent(s): 79d7264
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +95 -11
  2. README.md +166 -111
  3. app.py +1109 -793
  4. app_helper.py +305 -96
  5. bibguard.yaml +40 -7
  6. main.py +214 -166
  7. requirements.txt +1 -0
  8. scripts/install-hook.sh +53 -0
  9. src/__pycache__/__init__.cpython-311.pyc +0 -0
  10. src/__pycache__/__init__.cpython-313.pyc +0 -0
  11. src/analyzers/__pycache__/__init__.cpython-313.pyc +0 -0
  12. src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc +0 -0
  13. src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc +0 -0
  14. src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc +0 -0
  15. src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc +0 -0
  16. src/analyzers/__pycache__/retraction_checker.cpython-313.pyc +0 -0
  17. src/analyzers/__pycache__/url_validator.cpython-313.pyc +0 -0
  18. src/analyzers/__pycache__/usage_checker.cpython-313.pyc +0 -0
  19. src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc +0 -0
  20. src/analyzers/llm_evaluator.py +229 -81
  21. src/analyzers/metadata_comparator.py +29 -7
  22. src/checkers/__init__.py +3 -0
  23. src/checkers/__pycache__/__init__.cpython-313.pyc +0 -0
  24. src/checkers/__pycache__/acronym_checker.cpython-313.pyc +0 -0
  25. src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc +0 -0
  26. src/checkers/__pycache__/anonymization_checker.cpython-313.pyc +0 -0
  27. src/checkers/__pycache__/base.cpython-313.pyc +0 -0
  28. src/checkers/__pycache__/caption_checker.cpython-313.pyc +0 -0
  29. src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc +0 -0
  30. src/checkers/__pycache__/consistency_checker.cpython-313.pyc +0 -0
  31. src/checkers/__pycache__/equation_checker.cpython-313.pyc +0 -0
  32. src/checkers/__pycache__/formatting_checker.cpython-313.pyc +0 -0
  33. src/checkers/__pycache__/number_checker.cpython-313.pyc +0 -0
  34. src/checkers/__pycache__/reference_checker.cpython-313.pyc +0 -0
  35. src/checkers/__pycache__/sentence_checker.cpython-313.pyc +0 -0
  36. src/checkers/acronym_checker.py +13 -6
  37. src/checkers/ai_artifacts_checker.py +3 -3
  38. src/checkers/anonymization_checker.py +3 -3
  39. src/checkers/base.py +10 -4
  40. src/checkers/citation_quality_checker.py +1 -1
  41. src/checkers/consistency_checker.py +26 -6
  42. src/checkers/formatting_checker.py +6 -41
  43. src/checkers/retraction_checker.py +53 -0
  44. src/checkers/sentence_checker.py +1 -1
  45. src/checkers/template_checker.py +393 -0
  46. src/checkers/url_checker.py +80 -0
  47. src/config/__pycache__/__init__.cpython-313.pyc +0 -0
  48. src/config/__pycache__/workflow.cpython-313.pyc +0 -0
  49. src/config/__pycache__/yaml_config.cpython-313.pyc +0 -0
  50. src/config/yaml_config.py +92 -7
.gitignore CHANGED
@@ -1,9 +1,13 @@
 
1
  # Python
 
2
  __pycache__/
3
  *.py[cod]
4
  *$py.class
5
  *.so
6
  .Python
 
 
7
  build/
8
  develop-eggs/
9
  dist/
@@ -20,32 +24,96 @@ wheels/
20
  .installed.cfg
21
  *.egg
22
  MANIFEST
 
 
23
 
24
- # Virtual Environments
 
 
25
  venv/
26
  env/
27
  .env
 
 
28
  .venv/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- # IDEs
 
 
31
  .idea/
32
  .vscode/
33
  *.swp
34
  *.swo
 
 
 
 
35
 
36
- # macOS
 
 
37
  .DS_Store
38
  .AppleDouble
39
  .LSOverride
 
 
40
 
41
- # Project Specific Outputs
42
- *.txt
43
- *.md
44
- !README.md
 
 
 
 
 
 
 
 
45
  *_only_used_entry.bib
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- # LaTeX and Bibliography (User Data)
48
- # Ignoring these to prevent committing personal paper content
 
 
49
  *.tex
50
  *.bib
51
  *.pdf
@@ -57,6 +125,22 @@ env/
57
  *.synctex.gz
58
  *.fls
59
  *.fdb_latexmk
 
 
 
 
 
 
60
 
61
- # cache
62
- .cache
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
  # Python
3
+ # =============================================================================
4
  __pycache__/
5
  *.py[cod]
6
  *$py.class
7
  *.so
8
  .Python
9
+
10
+ # Distribution / packaging
11
  build/
12
  develop-eggs/
13
  dist/
 
24
  .installed.cfg
25
  *.egg
26
  MANIFEST
27
+ pip-log.txt
28
+ pip-delete-this-directory.txt
29
 
30
+ # =============================================================================
31
+ # Virtual environments / dependency managers
32
+ # =============================================================================
33
  venv/
34
  env/
35
  .env
36
+ .env.*
37
+ !.env.example
38
  .venv/
39
+ .python-version
40
+ .tool-versions
41
+
42
+ # =============================================================================
43
+ # Test / type / lint caches
44
+ # =============================================================================
45
+ .pytest_cache/
46
+ .cache/
47
+ .coverage
48
+ .coverage.*
49
+ htmlcov/
50
+ coverage.xml
51
+ .tox/
52
+ .nox/
53
+ .mypy_cache/
54
+ .ruff_cache/
55
+ .pyre/
56
+ .pytype/
57
 
58
+ # =============================================================================
59
+ # IDEs / editors
60
+ # =============================================================================
61
  .idea/
62
  .vscode/
63
  *.swp
64
  *.swo
65
+ *~
66
+ *.iml
67
+ .project
68
+ .pydevproject
69
 
70
+ # =============================================================================
71
+ # OS noise
72
+ # =============================================================================
73
  .DS_Store
74
  .AppleDouble
75
  .LSOverride
76
+ Thumbs.db
77
+ desktop.ini
78
 
79
+ # =============================================================================
80
+ # Gradio / Hugging Face Spaces
81
+ # =============================================================================
82
+ .gradio/
83
+ gradio_cached_examples/
84
+ flagged/
85
+
86
+ # =============================================================================
87
+ # BibGuard outputs (generated by main.py / app.py)
88
+ # =============================================================================
89
+ bibguard_output/
90
+ *_only_used.bib
91
  *_only_used_entry.bib
92
+ bibliography_report.md
93
+ latex_quality_report.md
94
+ line_by_line_report.md
95
+ report.html
96
+ report.json
97
+ # Local HTTP cache used by src/utils/http.py
98
+ .cache/bibguard/
99
+ **/.cache/bibguard/
100
+
101
+ # =============================================================================
102
+ # User secrets / personal config
103
+ # Recommendation: ship `bibguard.example.yaml` and gitignore the real one
104
+ # so API keys / personal paths don't leak. See README for details.
105
+ # =============================================================================
106
+ # bibguard.yaml
107
+ config.yaml
108
+ .bibguard.yaml
109
+ .bibguard.yml
110
+ secrets.yaml
111
+ *.local.yaml
112
 
113
+ # =============================================================================
114
+ # User paper data (LaTeX / BibTeX sources and build artifacts)
115
+ # Keep README.md, requirements*.txt, and source-tree .md files.
116
+ # =============================================================================
117
  *.tex
118
  *.bib
119
  *.pdf
 
125
  *.synctex.gz
126
  *.fls
127
  *.fdb_latexmk
128
+ *.toc
129
+ *.lof
130
+ *.lot
131
+ *.nav
132
+ *.snm
133
+ *.vrb
134
 
135
+ # Markdown / text files: ignore by default to prevent committing user paper
136
+ # content, but keep documentation and project metadata.
137
+ *.txt
138
+ *.md
139
+ !README.md
140
+ !CHANGELOG.md
141
+ !CONTRIBUTING.md
142
+ !LICENSE.md
143
+ !docs/**/*.md
144
+ !requirements.txt
145
+ !requirements-*.txt
146
+ !**/requirements.txt
README.md CHANGED
@@ -11,35 +11,46 @@ pinned: false
11
 
12
  # BibGuard: Bibliography & LaTeX Quality Auditor
13
 
14
- **BibGuard** is your comprehensive quality assurance tool for academic papers. It validates bibliography entries against real-world databases and checks LaTeX submission quality to catch errors before you submit.
15
 
16
- AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and uses advanced LLMs to ensure cited papers actually support your claims.
17
 
18
  ## 🛡 Why BibGuard?
19
 
20
- - **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata
21
- - **📋 LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, and submission compliance problems
22
- - **🔒 Safe & Non-Destructive**: Your original files are **never modified** - only detailed reports are generated
23
- - **🧠 Contextual Relevance**: Ensure cited papers actually discuss what you claim (with LLM)
24
- - ** Efficiency Boost**: Drastically reduce time needed to manually verify hundreds of citations
 
 
25
 
26
  ## 🚀 Features
27
 
28
  ### Bibliography Validation
29
- - **🔍 Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
30
- - **🤖 AI Relevance Check**: Uses LLMs to verify citations match their context (optional)
31
- - **📊 Preprint Detection**: Warns if >50% of references are preprints (arXiv, bioRxiv, etc.)
32
- - **👀 Usage Analysis**: Highlights missing citations and unused bib entries
33
- - **👯 Duplicate Detector**: Identifies duplicate entries with fuzzy matching
 
 
34
 
35
  ### LaTeX Quality Checks
36
- - **📐 Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation
37
- - **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases
38
- - **🔤 Consistency**: Spelling variants (US/UK English), hyphenation, terminology
39
- - **🤖 AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants
40
- - **🔠 Acronym Validation**: Ensures acronyms are defined before use (smart matching)
41
- - **🎭 Anonymization**: Checks for identity leaks in double-blind submissions
42
- - **📅 Citation Age**: Flags references older than 30 years
 
 
 
 
 
 
 
43
 
44
  ## 📦 Installation
45
 
@@ -57,10 +68,9 @@ pip install -r requirements.txt
57
  python main.py --init
58
  ```
59
 
60
- This creates `config.yaml`. Edit it to set your file paths. You have two modes:
61
 
62
- #### Option A: Single File Mode
63
- Best for individual papers.
64
  ```yaml
65
  files:
66
  bib: "paper.bib"
@@ -68,141 +78,186 @@ files:
68
  output_dir: "bibguard_output"
69
  ```
70
 
71
- #### Option B: Directory Scan Mode
72
- Best for large projects or a collection of papers. BibGuard will recursively search for all `.tex` and `.bib` files.
73
  ```yaml
74
  files:
75
  input_dir: "./my_project_dir"
76
  output_dir: "bibguard_output"
77
  ```
78
 
79
- ### 2. Run Full Check
80
 
81
  ```bash
82
- python main.py
 
 
 
 
 
83
  ```
84
 
85
- **Output** (in `bibguard_output/`):
86
- - `bibliography_report.md` - Bibliography validation results
87
- - `latex_quality_report.md` - Writing and formatting issues
88
- - `line_by_line_report.md` - All issues sorted by line number
89
- - `*_only_used.bib` - Clean bibliography (used entries only)
 
90
 
91
  ## 🛠 Configuration
92
 
93
- Edit `config.yaml` to customize checks:
94
 
95
  ```yaml
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  bibliography:
97
- check_metadata: true # Validate against online databases (takes time)
98
- check_usage: true # Find unused/missing entries
99
- check_duplicates: true # Detect duplicate entries
100
- check_preprint_ratio: true # Warn if >50% are preprints
101
  check_relevance: false # LLM-based relevance check (requires API key)
102
 
103
- submission:
104
- # Format checks
105
- caption: true # Table/figure caption placement
106
- reference: true # Cross-reference integrity
107
- formatting: true # Citation spacing, blank lines
108
- equation: true # Equation punctuation, numbering
109
-
110
- # Writing quality
111
- sentence: true # Weak starters, hedging language
112
- consistency: true # Spelling, hyphenation, terminology
113
- acronym: true # Acronym definitions (3+ letters)
114
-
115
- # Submission compliance
116
- ai_artifacts: true # AI-generated text detection
117
- anonymization: true # Double-blind compliance
118
- citation_quality: true # Old citations (>30 years)
119
- number: true # Percentage formatting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  ```
121
 
122
- ## 🤖 LLM-Based Relevance Check
123
 
124
- To verify citations match their context using AI:
125
 
126
- ```yaml
127
- bibliography:
128
- check_relevance: true
 
 
 
 
129
 
130
- llm:
131
- backend: "gemini" # Options: gemini, openai, anthropic, deepseek, ollama, vllm
132
- api_key: "" # Or use environment variable (e.g., GEMINI_API_KEY)
 
 
 
 
 
133
  ```
134
 
135
- **Supported Backends:**
136
- - **Gemini** (Google): `GEMINI_API_KEY`
137
- - **OpenAI**: `OPENAI_API_KEY`
138
- - **Anthropic**: `ANTHROPIC_API_KEY`
139
- - **DeepSeek**: `DEEPSEEK_API_KEY` (recommended for cost/performance)
140
- - **Ollama**: Local models (no API key needed)
141
- - **vLLM**: Custom endpoint
 
 
 
 
 
 
142
 
143
- Then run:
144
  ```bash
145
- python main.py
 
146
  ```
147
 
 
 
148
  ## 📝 Understanding Reports
149
 
150
- ### Bibliography Report
151
- Shows for each entry:
152
- - **Verified**: Metadata matches online databases
153
- - ⚠️ **Issues**: Mismatches, missing entries, duplicates
154
- - 📊 **Statistics**: Usage, duplicates, preprint ratio
 
 
 
155
 
156
- ### LaTeX Quality Report
157
- Organized by severity:
158
- - 🔴 **Errors**: Critical issues (e.g., undefined references)
159
- - 🟡 **Warnings**: Important issues (e.g., inconsistent spelling)
160
- - 🔵 **Suggestions**: Style improvements (e.g., weak sentence starters)
161
 
162
- ### Line-by-Line Report
163
- All LaTeX issues sorted by line number for easy fixing.
164
 
165
  ## 🧐 Understanding Mismatches
166
 
167
  BibGuard is strict, but false positives happen:
168
 
169
- 1. **Year Discrepancy (±1 Year)**:
170
- - *Reason*: Delay between preprint (arXiv) and official publication
171
- - *Action*: Verify which version you intend to cite
172
-
173
- 2. **Author List Variations**:
174
- - *Reason*: Different databases handle large author lists differently
175
- - *Action*: Check if primary authors match
176
-
177
- 3. **Venue Name Differences**:
178
- - *Reason*: Abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems")
179
- - *Action*: Both are usually correct
180
 
181
- 4. **Non-Academic Sources**:
182
- - *Reason*: Blogs, documentation not indexed by academic databases
183
- - *Action*: Manually verify URL and title
184
 
185
- ## 🔧 Advanced Options
186
-
187
- ```bash
188
- python main.py --help # Show all options
189
- python main.py --list-templates # List conference templates
190
- python main.py --config my.yaml # Use custom config file
191
- ```
192
 
193
  ## 🤝 Contributing
194
 
195
- Contributions welcome! Please open an issue or pull request.
196
 
197
  ## 🙏 Acknowledgments
198
 
199
- BibGuard uses multiple data sources:
200
- - arXiv API
201
- - CrossRef API
202
- - Semantic Scholar API
203
- - DBLP API
204
- - OpenAlex API
205
- - Google Scholar (via scholarly)
206
 
207
  ---
208
 
 
11
 
12
  # BibGuard: Bibliography & LaTeX Quality Auditor
13
 
14
+ **BibGuard** is a comprehensive quality-assurance tool for academic papers. It validates every bibliography entry against real-world databases, checks LaTeX submission quality, flags retracted DOIs and broken URLs, and uses an LLM (optional) to verify that cited papers actually support your claims.
15
 
16
+ AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and produces a single, beautiful, self-contained HTML report you can open offline.
17
 
18
  ## 🛡 Why BibGuard?
19
 
20
+ - **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata
21
+ - **🚫 Catch Retractions**: Detect references to papers that have been retracted or are under "expression of concern"
22
+ - **🔗 Detect Broken URLs**: HEAD-check `entry.url` to find dead links before reviewers do
23
+ - **📋 LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, double-blind compliance, AI-text artifacts
24
+ - **🔒 Safe & Non-Destructive**: Your original files are **never modified** — only reports are generated
25
+ - **🧠 Contextual Relevance** *(optional, with LLM)*: Score each citation 1-5 and tag its role (baseline/method/dataset/counterexample/survey/motivation/other)
26
+ - **⚡ Re-runs are fast**: SQLite-backed HTTP cache + auto-retry mean the second run on the same paper completes in seconds
27
 
28
  ## 🚀 Features
29
 
30
  ### Bibliography Validation
31
+ - **🔍 Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
32
+ - **🚫 Retraction Detection**: Flags retracted/withdrawn DOIs via CrossRef's `update-to` relation
33
+ - **🔗 URL Liveness Check**: Optional HEAD-then-GET check on every `entry.url`
34
+ - **📊 Preprint Detection**: Warns if >50% of references are preprints, and suggests published versions when arXiv records them
35
+ - **👀 Usage Analysis**: Highlights missing citations and unused bib entries
36
+ - **👯 Duplicate Detection**: Identifies duplicate entries with fuzzy matching
37
+ - **🤖 AI Relevance + Role Tagging** *(optional)*: 1-5 relevance score plus citation role classification
38
 
39
  ### LaTeX Quality Checks
40
+ - **📐 Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation
41
+ - **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases
42
+ - **🔤 Consistency**: Spelling variants (US/UK English), hyphenation, terminology — augmentable via project glossary
43
+ - **🤖 AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants
44
+ - **🔠 Acronym Validation**: Ensures acronyms are defined before use, with a project-glossary skip list
45
+ - **🎭 Anonymization**: Checks for identity leaks in double-blind submissions
46
+ - **📅 Citation Age**: Flags references older than 30 years
47
+ - **🎓 Conference Templates**: Mandatory-section and style-package checks for ACL, EMNLP, NAACL, CVPR, ICCV, ECCV, NeurIPS, ICML, ICLR
48
+
49
+ ### Outputs
50
+ - 📄 **Markdown reports** — bibliography validation + LaTeX quality issues
51
+ - 🌐 **Self-contained HTML** — dark mode, full-text search, per-section severity filters, inline highlighting of the offending span on each LaTeX issue. Opens offline, no server required
52
+ - 🤖 **JSON** for CI / scripts / custom dashboards
53
+ - 🧹 **Cleaned `.bib`** containing only entries actually cited in the paper
54
 
55
  ## 📦 Installation
56
 
 
68
  python main.py --init
69
  ```
70
 
71
+ This creates `config.yaml`. Edit it to point at your `.bib` and `.tex` files.
72
 
73
+ #### Single File Mode
 
74
  ```yaml
75
  files:
76
  bib: "paper.bib"
 
78
  output_dir: "bibguard_output"
79
  ```
80
 
81
+ #### Directory Scan Mode
82
+ For projects with multiple `.tex` and `.bib` files:
83
  ```yaml
84
  files:
85
  input_dir: "./my_project_dir"
86
  output_dir: "bibguard_output"
87
  ```
88
 
89
+ ### 2. Run a Check
90
 
91
  ```bash
92
+ python main.py # full check using config.yaml / bibguard.yaml
93
+ python main.py --quick # local-only checks (no network, instant)
94
+ python main.py --format json,html # pick output formats
95
+ python main.py --verbose # DEBUG logs to stderr
96
+ python main.py --config my.yaml # custom config path
97
+ python main.py --list-templates # list conference templates
98
  ```
99
 
100
+ **Default outputs** (in `bibguard_output/`):
101
+ - `report.html` — single self-contained HTML, opens offline, dark-mode aware
102
+ - `report.json` — full machine-readable dump (only when `json` is in `output.formats`)
103
+ - `bibliography_report.md` — bibliography validation, with corroboration notes
104
+ - `latex_quality_report.md` — LaTeX quality issues, errors / warnings / suggestions, full line content with the offending span bolded
105
+ - `<bibname>_only_used.bib` — clean bibliography of cited entries only
106
 
107
  ## 🛠 Configuration
108
 
109
+ `bibguard.yaml` (or `config.yaml`) contains the following sections:
110
 
111
  ```yaml
112
+ files:
113
+ bib: "paper.bib"
114
+ tex: "paper.tex"
115
+ output_dir: "bibguard_output"
116
+
117
+ network:
118
+ contact_email: "" # used in polite-pool User-Agent for arXiv/CrossRef/OpenAlex
119
+ cache_enabled: true # local SQLite cache for HTTP responses (~/.cache/bibguard)
120
+ cache_ttl_hours: 24
121
+ retry_total: 5 # auto-retry on 429/5xx with exponential backoff
122
+ retry_backoff_factor: 1.5
123
+
124
+ template: "" # acl | emnlp | naacl | cvpr | iccv | eccv | neurips | icml | iclr
125
+
126
  bibliography:
127
+ check_metadata: true # verify against online databases (slow on first run, fast on repeats)
128
+ check_usage: true # find unused entries / missing citations
129
+ check_duplicates: true
130
+ check_preprint_ratio: true # warn if >50% of references are preprints
131
  check_relevance: false # LLM-based relevance check (requires API key)
132
 
133
+ submission_extra:
134
+ url_liveness: false # HEAD-check every entry.url field (slow)
135
+ retraction: true # flag retracted DOIs via CrossRef
136
+
137
+ submission: # 11 LaTeX checkers — toggle each independently
138
+ caption: true
139
+ reference: true
140
+ formatting: true
141
+ equation: true
142
+ ai_artifacts: true
143
+ sentence: true
144
+ consistency: true
145
+ acronym: true
146
+ number: true
147
+ citation_quality: true
148
+ anonymization: true
149
+
150
+ # Project glossary feeds the consistency / acronym checkers.
151
+ glossary:
152
+ preferred:
153
+ - "Transformer"
154
+ - "fine-tuning"
155
+ acronyms:
156
+ NLP: "Natural Language Processing"
157
+ LLM: "Large Language Model"
158
+
159
+ llm:
160
+ backend: "gemini" # gemini | openai | anthropic | deepseek | ollama | vllm
161
+ model: "" # leave empty for sensible default per backend
162
+ api_key: "" # PREFER env var: $GEMINI_API_KEY / $OPENAI_API_KEY / etc.
163
+
164
+ output:
165
+ quiet: false
166
+ minimal_verified: false
167
+ formats: [markdown, html] # any of: markdown, html, json
168
  ```
169
 
170
+ ## 🤖 LLM-Based Relevance + Role Tagging
171
 
172
+ When `bibliography.check_relevance` is `true`, BibGuard sends each citation's surrounding context plus the cited paper's abstract to your chosen LLM. The model returns a 1-5 relevance score, an `is_relevant` boolean, a one-sentence explanation, and a **citation role**:
173
 
174
+ - `baseline` — cited as a comparison/baseline
175
+ - `method` — cited paper introduces a method this one builds on
176
+ - `dataset` — provides a dataset/benchmark used here
177
+ - `counterexample` — cited to argue against
178
+ - `survey` — cited as a survey/overview
179
+ - `motivation` — cited to motivate the problem
180
+ - `other`
181
 
182
+ **Supported backends**: Gemini, OpenAI, Anthropic, DeepSeek, Ollama (local), vLLM (custom endpoint).
183
+
184
+ **API keys**: read from environment variables by convention — `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `DEEPSEEK_API_KEY`. Set them in your shell rather than committing `api_key:` to `bibguard.yaml`.
185
+
186
+ ## 🌐 Web UI
187
+
188
+ ```bash
189
+ python app.py
190
  ```
191
 
192
+ Opens at `http://localhost:7860`. The web UI mirrors the CLI but with a streaming status panel and three presets:
193
+
194
+ - **Quick** — local checks only, no network, instant
195
+ - **Standard** — local + retraction lookup (CrossRef)
196
+ - **Strict** adds multi-source metadata fetch + URL liveness (slow on first run; subsequent runs are cached)
197
+
198
+ The toolbar fits in one row: file uploads, preset chips, and Run / Stop. Per-check overrides live in the **Advanced** accordion. The report renders inline as a self-contained iframe so the page stays stable while entries stream in. Downloads (HTML, Markdown bib, JSON, cleaned `.bib`, `bibguard.log`) appear in the **Downloads** accordion below.
199
+
200
+ Set `BIBGUARD_CONTACT_EMAIL=you@example.com` in your shell to use a real contact in the polite-pool User-Agent.
201
+
202
+ ## 🪝 Pre-commit Hook
203
+
204
+ To run BibGuard automatically before each commit that touches `.tex` or `.bib`:
205
 
 
206
  ```bash
207
+ cd /path/to/your-paper-repo
208
+ bash /path/to/BibGuard/scripts/install-hook.sh
209
  ```
210
 
211
+ Skip the hook for one commit with `git commit --no-verify`.
212
+
213
  ## 📝 Understanding Reports
214
 
215
+ ### Self-Contained HTML (`report.html`)
216
+ The recommended output. Single file, no external assets, dark-mode aware. Includes:
217
+ - Three tabs: **Bibliography** · **LaTeX Quality** · **Retractions / URLs**
218
+ - **Per-section filter chips** — bibliography filters by Verified / Unverified / Unused; LaTeX quality filters by Errors / Warnings / Info
219
+ - **Full-text search** across titles, authors, keys, and messages — works inside the active tab
220
+ - **Inline span highlighting** — for LaTeX issues that come from a regex (e.g., `\cite{}` without `~`), the offending substring is wrapped in `<mark>` so you can see exactly *where* in the line to look
221
+ - **Honest empty states** — Retractions / URL liveness panels report how many entries actually carried a `doi=` / `url=` field, so an empty result no longer looks like the check failed silently
222
+ - Theme toggle that overrides system preference
223
 
224
+ ### Markdown Reports
225
+ Two files for granular review and code review tooling:
226
+ - `bibliography_report.md` — every entry with metadata-match status, including positive **corroboration notes** when a second source agreed
227
+ - `latex_quality_report.md` — issues grouped by checker and severity, full line content with the offending span bolded
 
228
 
229
+ ### JSON Output
230
+ Machine-readable dump for CI integration. Top-level keys: `meta`, `summary`, `entries`, `submission_results`, `retractions`, `url_findings`, `duplicates`, `missing_citations`.
231
 
232
  ## 🧐 Understanding Mismatches
233
 
234
  BibGuard is strict, but false positives happen:
235
 
236
+ 1. **Year Discrepancy (±1 Year)** — preprint vs. official publication. Verify which version you intend to cite.
237
+ 2. **Author List Variations** — different databases truncate large author lists differently. Check primary authors.
238
+ 3. **Venue Name Differences** — abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems"). Both usually correct.
239
+ 4. **Non-Academic Sources** — blogs and documentation aren't indexed by academic databases. Verify URL and title manually.
 
 
 
 
 
 
 
240
 
241
+ ## 🔧 Performance Notes
 
 
242
 
243
+ - **First run** with `check_metadata: true` on ~100 entries: 1-3 minutes (rate-limited by arXiv/CrossRef).
244
+ - **Re-runs**: seconds, thanks to the SQLite HTTP cache at `~/.cache/bibguard/http_cache.sqlite` (TTL 24h by default).
245
+ - **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers.
246
+ - **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with cache cold.
 
 
 
247
 
248
  ## 🤝 Contributing
249
 
250
+ Contributions welcome. Open an issue or pull request.
251
 
252
  ## 🙏 Acknowledgments
253
 
254
+ BibGuard uses the following data sources:
255
+ - [arXiv API](https://info.arxiv.org/help/api/index.html)
256
+ - [CrossRef REST API](https://api.crossref.org)
257
+ - [Semantic Scholar Graph API](https://api.semanticscholar.org)
258
+ - [DBLP API](https://dblp.org/faq/How+to+use+the+dblp+search+API.html)
259
+ - [OpenAlex API](https://docs.openalex.org)
260
+ - Google Scholar (via scraping; rate-limited)
261
 
262
  ---
263
 
app.py CHANGED
@@ -1,927 +1,1243 @@
1
  #!/usr/bin/env python3
2
  """
3
- BibGuard Gradio Web Application
4
 
5
- A web interface for checking bibliography and LaTeX quality.
 
 
 
 
6
  """
7
- import gradio as gr
 
 
 
 
8
  import tempfile
9
- import shutil
10
  from pathlib import Path
11
- from typing import Optional, Tuple
12
- import base64
13
 
14
  from src.parsers import BibParser, TexParser
15
- from src.fetchers import ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher, OpenAlexFetcher, DBLPFetcher
 
 
 
16
  from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
17
  from src.report.generator import ReportGenerator, EntryReport
18
- from src.config.yaml_config import BibGuardConfig, FilesConfig, BibliographyConfig, SubmissionConfig, OutputConfig, WorkflowStep
19
- from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
 
 
20
  from src.checkers import CHECKER_REGISTRY
21
- from src.report.line_report import LineByLineReportGenerator
 
 
 
 
22
  from app_helper import fetch_and_compare_with_workflow
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- # Custom CSS for better Markdown rendering
26
  CUSTOM_CSS = """
27
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
28
 
29
- * {
30
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
31
- }
32
- """
33
 
34
- WELCOME_HTML = """
35
- <div class="scrollable-report-area">
36
- <div class="report-card" style="max-width: 800px; margin: 0 auto;">
37
- <div class="card-header">
38
- <h3 class="card-title" style="font-size: 1.5em;">👋 Welcome to BibGuard</h3>
39
- </div>
40
- <div class="card-content" style="line-height: 1.6; color: #374151;">
41
- <p style="font-size: 1.1em; margin-bottom: 24px;">
42
- Ensure your academic paper is flawless. Upload your <code>.bib</code> and <code>.tex</code> files on the left and click <strong>"Check Now"</strong>.
43
- </p>
44
-
45
- <div style="display: grid; gap: 20px; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));">
46
- <div style="background: #fefce8; padding: 16px; border-radius: 8px; border: 1px solid #fde047;">
47
- <strong style="color: #854d0e; display: block; margin-bottom: 8px;">⚠️ Metadata Check Defaults</strong>
48
- "🔍 Metadata" is <strong>disabled by default</strong>. It verifies your entries against ArXiv/DBLP/Crossref but takes time (1-3 mins) to fetch data. Enable it if you want strict verification.
49
- </div>
50
-
51
- <div style="background: #eff6ff; padding: 16px; border-radius: 8px; border: 1px solid #bfdbfe;">
52
- <strong style="color: #1e40af; display: block; margin-bottom: 8px;">🚀 Go Pro with Local Version</strong>
53
- LLM-based context relevance checking (is this citation actually relevant?) is excluded here. Clone the <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="color: #2563eb; text-decoration: underline; font-weight: 600;">GitHub repo</a> to use the full power with your API key.
54
- </div>
55
- </div>
56
-
57
- <h4 style="margin: 24px 0 12px 0; color: #111827; font-size: 1.1em;">📊 Understanding Your Reports</h4>
58
- <div style="display: grid; gap: 12px;">
59
- <div style="display: flex; gap: 12px; align-items: baseline;">
60
- <span style="background: #e0e7ff; color: #3730a3; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📚 Bibliography</span>
61
- <span>Validates metadata fields, detects duplicates, and checks citation counts.</span>
62
- </div>
63
- <div style="display: flex; gap: 12px; align-items: baseline;">
64
- <span style="background: #dcfce7; color: #166534; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📝 LaTeX Quality</span>
65
- <span>Syntax check, caption validation, acronym consistency, and style suggestions.</span>
66
- </div>
67
- <div style="display: flex; gap: 12px; align-items: baseline;">
68
- <span style="background: #f3f4f6; color: #4b5563; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📋 Line-by-Line</span>
69
- <span>Maps every issue found directly to the line number in your source file.</span>
70
- </div>
71
- </div>
72
- </div>
73
- </div>
74
- </div>
75
- """
76
-
77
- CUSTOM_CSS += """
78
- /* Global Reset */
79
- body, gradio-app {
80
- overflow: hidden !important; /* Prevent double scrollbars on the page */
81
- }
82
 
83
  .gradio-container {
84
- max-width: none !important;
 
 
 
85
  width: 100% !important;
86
- /* height: 100vh !important; <-- Removed to prevent iframe infinite loop */
87
- padding: 0 !important;
88
- margin: 0 !important;
89
  }
90
 
91
- /* Header Styling */
92
- .app-header {
93
- padding: 20px;
94
- background: white;
95
  border-bottom: 1px solid #e5e7eb;
 
96
  }
97
 
98
- /* Sidebar Styling */
99
- .app-sidebar {
100
- height: auto !important;
101
- max-height: calc(100vh - 100px) !important;
102
- overflow-y: auto !important;
103
- padding: 20px !important;
104
- border-right: 1px solid #e5e7eb;
 
 
 
105
  }
106
-
107
- /* Main Content Area */
108
- .app-content {
109
- height: auto !important;
110
- max-height: calc(100vh - 100px) !important;
111
- padding: 0 !important;
 
 
 
 
 
 
 
 
112
  }
113
-
114
- /* The Magic Scroll Container - Clean and Explicit */
115
- .scrollable-report-area {
116
- /* Fixed height relative to viewport can cause loops in Spaces */
117
- max-height: 800px !important;
118
- height: auto !important;
119
- min-height: 500px !important;
120
- overflow-y: auto !important;
121
- padding: 24px;
122
- background-color: #f9fafb;
123
- border: 1px solid #e5e7eb;
124
- border-radius: 8px;
125
- margin-top: 10px;
 
126
  }
127
-
128
- /* Report Card Styling */
129
- .report-card {
130
- background: white;
131
- border-radius: 12px;
132
- padding: 24px;
133
- margin-bottom: 16px; /* Spacing between cards */
134
- box-shadow: 0 1px 3px rgba(0,0,0,0.1);
135
- border: 1px solid #e5e7eb;
136
- transition: transform 0.2s, box-shadow 0.2s;
137
  }
138
 
139
- .report-card:hover {
140
- box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
141
- transform: translateY(-2px);
 
 
 
 
 
142
  }
143
-
144
- /* Card Internals */
145
- .card-header {
146
- display: flex;
147
- justify-content: space-between;
148
- align-items: flex-start;
149
- margin-bottom: 16px;
150
- padding-bottom: 16px;
151
- border-bottom: 1px solid #f3f4f6;
 
152
  }
153
-
154
- .card-title {
155
- font-size: 1.1em;
156
- font-weight: 600;
157
- color: #111827;
158
- margin: 0 0 4px 0;
159
  }
160
-
161
- .card-subtitle {
162
- font-size: 0.9em;
163
- color: #6b7280;
164
- font-family: monospace;
 
 
 
 
 
165
  }
166
-
167
- .card-content {
168
- font-size: 0.95em;
169
- color: #374151;
170
- line-height: 1.5;
 
 
 
 
 
 
 
 
 
 
171
  }
172
-
173
- /* Badges */
174
- .badge {
175
- display: inline-flex;
176
- align-items: center;
177
- padding: 4px 10px;
178
- border-radius: 9999px;
179
- font-size: 0.8em;
180
- font-weight: 500;
 
 
 
 
 
 
 
 
 
 
 
 
181
  }
182
-
183
- .badge-success { background-color: #dcfce7; color: #166534; }
184
- .badge-warning { background-color: #fef9c3; color: #854d0e; }
185
- .badge-error { background-color: #fee2e2; color: #991b1b; }
186
- .badge-info { background-color: #dbeafe; color: #1e40af; }
187
- .badge-neutral { background-color: #f3f4f6; color: #4b5563; }
188
-
189
- /* Stats Grid */
190
- .stats-container {
191
- display: grid;
192
- grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
193
- gap: 16px;
194
- margin-bottom: 24px;
195
  }
196
 
197
- .stat-card {
198
- padding: 16px;
199
- border-radius: 12px;
200
- color: white;
201
- text-align: center;
202
- box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
 
 
 
 
 
203
  }
204
-
205
- .stat-value { font-size: 1.8em; font-weight: 700; }
206
- .stat-label { font-size: 0.9em; opacity: 0.9; }
207
-
208
- /* Detail Grid - Flexbox for better filling */
209
- .detail-grid {
210
- display: flex;
211
- flex-wrap: wrap;
212
- gap: 12px;
213
- margin-bottom: 16px;
214
- width: 100%;
 
 
 
 
 
215
  }
216
 
217
- .detail-item {
218
- background: #f9fafb;
219
- padding: 10px 12px;
220
- border-radius: 8px;
221
- border: 1px solid #f3f4f6;
222
-
223
- /* Flex sizing: grow, shrink, min-basis */
224
- flex: 1 1 160px;
225
- min-width: 0; /* Important for word-break to work in flex children */
226
-
227
- /* Layout control */
228
- display: flex;
229
- flex-direction: column;
230
-
231
- /* Height constraint to prevent one huge card from stretching the row */
232
- max-height: 100px;
233
- overflow-y: auto;
234
  }
235
 
236
- /* Custom scrollbar for detail items */
237
- .detail-item::-webkit-scrollbar {
238
- width: 4px;
 
 
 
 
 
 
 
 
239
  }
240
- .detail-item::-webkit-scrollbar-thumb {
241
- background-color: #d1d5db;
242
- border-radius: 4px;
 
243
  }
244
-
245
- .detail-label {
246
- font-size: 0.75em;
247
- color: #6b7280;
248
- text-transform: uppercase;
249
- letter-spacing: 0.05em;
250
- margin-bottom: 2px;
251
- position: sticky;
252
- top: 0;
253
- background: #f9fafb; /* Maintain bg on scroll */
254
- z-index: 1;
255
  }
 
256
 
257
- .detail-value {
258
- font-weight: 500;
259
- color: #1f2937;
260
- font-size: 0.9em;
261
- line-height: 1.4;
262
- word-break: break-word; /* Fix overflow */
263
- overflow-wrap: break-word;
264
- } border: 1px solid #e5e7eb;
265
- transition: all 0.2s;
266
  }
267
 
268
- .report-card:hover {
269
- box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
 
 
 
 
 
 
 
 
 
 
270
  }
271
-
272
- /* Card Header */
273
- .card-header {
 
 
 
 
 
 
 
 
 
 
 
274
  display: flex;
275
- justify-content: space-between;
276
- align-items: flex-start;
277
- margin-bottom: 12px;
278
- border-bottom: 1px solid #f3f4f6;
279
- padding-bottom: 12px;
280
  }
281
-
282
- .card-title {
283
- font-size: 1.1em;
284
  font-weight: 600;
285
- color: #1f2937;
286
- margin: 0;
287
- }
288
-
289
- .card-subtitle {
290
- font-size: 0.9em;
291
- color: #6b7280;
292
- margin-top: 4px;
293
- }
294
-
295
- /* Status Badges */
296
- .badge {
297
  display: inline-flex;
298
  align-items: center;
299
- padding: 4px 10px;
300
- border-radius: 9999px;
301
- font-size: 0.8em;
302
- font-weight: 500;
303
  }
304
-
305
- .badge-success { background-color: #dcfce7; color: #166534; }
306
- .badge-warning { background-color: #fef9c3; color: #854d0e; }
307
- .badge-error { background-color: #fee2e2; color: #991b1b; }
308
- .badge-info { background-color: #dbeafe; color: #1e40af; }
309
- .badge-neutral { background-color: #f3f4f6; color: #374151; }
310
-
311
- /* Content Styling */
312
- .card-content {
313
- font-size: 15px;
314
- color: #374151;
315
- line-height: 1.6;
316
  }
317
-
318
- .card-content code {
319
- background-color: #f3f4f6;
320
- padding: 2px 6px;
321
  border-radius: 4px;
322
- font-family: monospace;
323
- font-size: 0.9em;
324
- color: #c2410c;
325
  }
326
-
327
- /* Grid for details */
328
- .detail-grid {
329
- display: grid;
330
- grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
331
  gap: 12px;
332
- margin-top: 12px;
 
333
  }
334
-
335
- .detail-item {
336
- background: #f9fafb;
337
- padding: 10px;
338
- border-radius: 6px;
 
 
 
 
 
 
339
  }
 
340
 
341
- .detail-label {
342
- font-size: 0.8em;
343
- color: #6b7280;
344
- text-transform: uppercase;
345
- letter-spacing: 0.05em;
 
 
 
 
 
 
 
346
  }
347
 
348
- .detail-value {
349
- font-weight: 500;
350
- color: #111827;
 
 
 
 
 
 
351
  }
352
-
353
- /* Summary Stats */
354
- .stats-container {
355
- display: grid;
356
- grid-template-columns: repeat(3, 1fr);
357
- gap: 16px;
358
- margin-bottom: 24px;
 
 
 
 
 
359
  }
360
 
361
- .stat-card {
362
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
363
- color: white;
364
- padding: 20px;
365
- border-radius: 12px;
366
  text-align: center;
367
- box-shadow: 0 4px 6px rgba(102, 126, 234, 0.25);
 
 
 
 
368
  }
369
-
370
- .stat-value {
371
- font-size: 2em;
372
- font-weight: 700;
373
- }
374
-
375
- .stat-label {
376
- font-size: 0.9em;
377
- opacity: 0.9;
378
- margin-top: 4px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
  }
 
380
 
381
- /* Button styling */
382
- .primary-btn {
383
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
384
- border: none !important;
385
- font-weight: 600 !important;
386
- }
387
 
388
- /* Tab styling */
389
- .tab-nav button {
390
- font-weight: 500 !important;
391
- font-size: 15px !important;
392
- }
 
 
 
 
 
 
393
  """
394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
 
396
  def create_config_from_ui(
397
- check_metadata: bool,
398
- check_usage: bool,
399
- check_duplicates: bool,
400
- check_preprint_ratio: bool,
401
- caption: bool,
402
- reference: bool,
403
- formatting: bool,
404
- equation: bool,
405
- ai_artifacts: bool,
406
- sentence: bool,
407
- consistency: bool,
408
- acronym: bool,
409
- number: bool,
410
- citation_quality: bool,
411
- anonymization: bool
412
  ) -> BibGuardConfig:
413
- """Create a BibGuardConfig from UI settings."""
414
  config = BibGuardConfig()
415
-
416
  config.bibliography = BibliographyConfig(
417
  check_metadata=check_metadata,
418
  check_usage=check_usage,
419
  check_duplicates=check_duplicates,
420
  check_preprint_ratio=check_preprint_ratio,
421
- check_relevance=False # Disabled for web
422
  )
423
-
424
  config.submission = SubmissionConfig(
425
- caption=caption,
426
- reference=reference,
427
- formatting=formatting,
428
- equation=equation,
429
- ai_artifacts=ai_artifacts,
430
- sentence=sentence,
431
- consistency=consistency,
432
- acronym=acronym,
433
- number=number,
434
- citation_quality=citation_quality,
435
- anonymization=anonymization
436
  )
437
-
438
  config.output = OutputConfig(quiet=True, minimal_verified=False)
439
-
440
  return config
441
 
442
 
443
- def generate_bibliography_html(report_gen: ReportGenerator, entries: list) -> str:
444
- """Generate HTML content for bibliography report."""
445
- html = ['<div class="scrollable-report-area">']
446
-
447
- # 1. Summary Stats
448
- total = len(entries)
449
- verified = sum(1 for e in report_gen.entries if e.comparison and e.comparison.is_match)
450
- used = sum(1 for e in report_gen.entries if e.usage and e.usage.is_used)
451
-
452
- html.append('<div class="stats-container">')
453
- html.append(f'<div class="stat-card"><div class="stat-value">{total}</div><div class="stat-label">Total Entries</div></div>')
454
- html.append(f'<div class="stat-card"><div class="stat-value">{verified}</div><div class="stat-label">Verified</div></div>')
455
- html.append(f'<div class="stat-card"><div class="stat-value">{used}</div><div class="stat-label">Used in Text</div></div>')
456
- html.append('</div>')
457
-
458
- # 2. Entries
459
- for report in report_gen.entries:
460
- entry = report.entry
461
- status_badges = []
462
-
463
- # Metadata Status
464
- if report.comparison:
465
- if report.comparison.is_match:
466
- status_badges.append('<span class="badge badge-success">✓ Verified</span>')
467
- if report.comparison.source:
468
- status_badges.append(f'<span class="badge badge-info">{report.comparison.source.upper()}</span>')
469
- else:
470
- status_badges.append('<span class="badge badge-error">⚠ Metadata Mismatch</span>')
471
- else:
472
- status_badges.append('<span class="badge badge-neutral">No Metadata Check</span>')
473
-
474
- # Usage Status
475
- if report.usage:
476
- if report.usage.is_used:
477
- status_badges.append(f'<span class="badge badge-success">Used: {report.usage.usage_count}x</span>')
478
- else:
479
- status_badges.append('<span class="badge badge-warning">Unused</span>')
480
-
481
- # Build Card
482
- html.append(f'''
483
- <div class="report-card">
484
- <div class="card-header">
485
- <div>
486
- <h3 class="card-title">{entry.title or "No Title"}</h3>
487
- <div class="card-subtitle">{entry.key} • {entry.year} • {entry.entry_type}</div>
488
- </div>
489
- <div style="display: flex; gap: 8px;">
490
- {" ".join(status_badges)}
491
- </div>
492
- </div>
493
-
494
- <div class="card-content">
495
- <div class="detail-grid">
496
- {
497
- (lambda e: "".join([
498
- f'<div class="detail-item"><div class="detail-label">{k}</div><div class="detail-value">{v}</div></div>'
499
- for k, v in filter(None, [
500
- ("Authors", e.author or "N/A"),
501
- ("Venue", e.journal or e.booktitle or e.publisher or "N/A"),
502
- ("DOI", e.doi) if e.doi else None,
503
- ("ArXiv", e.arxiv_id) if e.arxiv_id and not e.doi else None,
504
- ("Volume/Pages", f"{'Vol.'+e.volume if e.volume else ''} {'pp.'+e.pages if e.pages else ''}".strip()) if e.volume or e.pages else None,
505
- ("URL", f'<a href="{e.url}" target="_blank" style="text-decoration:underline;">Link</a>') if e.url else None
506
- ])
507
- ]))(entry)
508
- }
509
- </div>
510
- ''')
511
-
512
- # Add issues if any
513
- issues = []
514
- if report.comparison and not report.comparison.is_match:
515
- # Add main message derived from match status
516
- if report.comparison.issues:
517
- for issue in report.comparison.issues:
518
- issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• {issue}</div>')
519
- else:
520
- issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• Verification failed</div>')
521
-
522
- if issues:
523
- html.append('<div style="margin-top: 16px; padding-top: 12px; border-top: 1px solid #eee;">')
524
- html.append("".join(issues))
525
- html.append('</div>')
526
-
527
- html.append('</div></div>') # Close card-content and report-card
528
-
529
- html.append('</div>') # Close container
530
- return "".join(html)
531
-
532
- def generate_latex_html(results: list) -> str:
533
- """Generate HTML for LaTeX quality check."""
534
- from src.checkers import CheckSeverity
535
-
536
- html = ['<div class="scrollable-report-area">']
537
-
538
- # Stats
539
- errors = sum(1 for r in results if r.severity == CheckSeverity.ERROR)
540
- warnings = sum(1 for r in results if r.severity == CheckSeverity.WARNING)
541
- infos = sum(1 for r in results if r.severity == CheckSeverity.INFO)
542
-
543
- html.append('<div class="stats-container">')
544
- html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #ef4444 0%, #b91c1c 100%);"><div class="stat-value">{errors}</div><div class="stat-label">Errors</div></div>')
545
- html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%);"><div class="stat-value">{warnings}</div><div class="stat-label">Warnings</div></div>')
546
- html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%);"><div class="stat-value">{infos}</div><div class="stat-label">Suggestions</div></div>')
547
- html.append('</div>')
548
-
549
- if not results:
550
- html.append('<div class="report-card"><div class="card-content" style="text-align: center; padding: 40px; color: #166534; font-size: 1.2em;">✅ No issues found in LaTeX code!</div></div>')
551
- else:
552
- # Group by Checker
553
- results.sort(key=lambda x: x.checker_name)
554
- current_checker = None
555
-
556
- for result in results:
557
- badge_class = "badge-neutral"
558
- if result.severity == CheckSeverity.ERROR: badge_class = "badge-error"
559
- elif result.severity == CheckSeverity.WARNING: badge_class = "badge-warning"
560
- elif result.severity == CheckSeverity.INFO: badge_class = "badge-info"
561
-
562
- html.append(f'''
563
- <div class="report-card">
564
- <div class="card-header">
565
- <div>
566
- <h3 class="card-title">{result.checker_name}</h3>
567
- <div class="card-subtitle">Line {result.line_number}</div>
568
- </div>
569
- <span class="badge {badge_class}">{result.severity.name}</span>
570
- </div>
571
- <div class="card-content">
572
- {result.message}
573
- {f'<div style="margin-top: 8px; background: #f3f4f6; padding: 8px; border-radius: 4px; font-family: monospace;">{result.line_content}</div>' if result.line_content else ''}
574
- {f'<div style="margin-top: 8px; color: #166534;">💡 Suggestion: {result.suggestion}</div>' if result.suggestion else ''}
575
- </div>
576
- </div>
577
- ''')
578
-
579
- html.append('</div>')
580
- return "".join(html)
581
-
582
- def generate_line_html(content: str, results: list) -> str:
583
- """Generate HTML for Line-by-Line report."""
584
- # Build a dictionary of line_number -> list of issues
585
- issues_by_line = {}
586
- for r in results:
587
- if r.line_number not in issues_by_line:
588
- issues_by_line[r.line_number] = []
589
- issues_by_line[r.line_number].append(r)
590
-
591
- lines = content.split('\n')
592
-
593
- html = ['<div class="scrollable-report-area">']
594
-
595
- html.append('<div class="report-card"><div class="card-content">Issues are mapped to specific lines below.</div></div>')
596
-
597
- for i, line in enumerate(lines, 1):
598
- if i in issues_by_line:
599
- # Highlight this line
600
- line_issues = issues_by_line[i]
601
-
602
- html.append(f'''
603
- <div class="report-card" style="border-left: 4px solid #ef4444; padding: 12px;">
604
- <div style="font-family: monospace; color: #6b7280; font-size: 0.9em; margin-bottom: 4px;">Line {i}</div>
605
- <div style="font-family: monospace; background: #fee2e2; padding: 4px; border-radius: 4px; overflow-x: auto; white-space: pre;">{line}</div>
606
- <div style="margin-top: 8px;">
607
- ''')
608
-
609
- for issue in line_issues:
610
- html.append(f'<div style="color: #991b1b; font-size: 0.95em; margin-top: 4px;">• {issue.message}</div>')
611
-
612
- html.append('</div></div>')
613
-
614
- html.append('</div>')
615
- return "".join(html)
616
 
617
 
 
 
 
 
 
 
 
 
 
 
618
 
619
 
 
 
 
 
 
 
 
 
 
620
  def run_check(
621
- bib_file,
622
- tex_file,
623
- check_metadata: bool,
624
- check_usage: bool,
625
- check_duplicates: bool,
626
- check_preprint_ratio: bool,
627
- caption: bool,
628
- reference: bool,
629
- formatting: bool,
630
- equation: bool,
631
- ai_artifacts: bool,
632
- sentence: bool,
633
- consistency: bool,
634
- acronym: bool,
635
- number: bool,
636
- citation_quality: bool,
637
- anonymization: bool,
638
- progress=gr.Progress()
639
- ) -> Tuple[str, str, str]:
640
- """Run BibGuard checks and return three reports."""
641
-
642
- if bib_file is None or tex_file is None:
643
- return (
644
- "⚠️ Please upload both `.bib` and `.tex` files.",
645
- "⚠️ Please upload both `.bib` and `.tex` files.",
646
- "⚠️ Please upload both `.bib` and `.tex` files."
647
  )
648
-
649
- try:
650
- # Create config from UI
651
- config = create_config_from_ui(
652
- check_metadata, check_usage, check_duplicates, check_preprint_ratio,
653
- caption, reference, formatting, equation, ai_artifacts,
654
- sentence, consistency, acronym, number, citation_quality, anonymization
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655
  )
656
-
657
- # Get file paths from uploaded files
658
- bib_path = bib_file.name
659
- tex_path = tex_file.name
660
-
661
- # Read tex content for checkers
662
- tex_content = Path(tex_path).read_text(encoding='utf-8', errors='replace')
663
-
664
- # Parse files
665
- bib_parser = BibParser()
666
- entries = bib_parser.parse_file(bib_path)
667
-
668
- tex_parser = TexParser()
669
- tex_parser.parse_file(tex_path)
670
-
671
- bib_config = config.bibliography
672
-
673
- # Initialize components
674
- arxiv_fetcher = None
675
- crossref_fetcher = None
676
- semantic_scholar_fetcher = None
677
- openalex_fetcher = None
678
- dblp_fetcher = None
679
- comparator = None
680
- usage_checker = None
681
- duplicate_detector = None
682
-
683
- if bib_config.check_metadata:
684
- arxiv_fetcher = ArxivFetcher()
685
- semantic_scholar_fetcher = SemanticScholarFetcher()
686
- openalex_fetcher = OpenAlexFetcher()
687
- dblp_fetcher = DBLPFetcher()
688
- crossref_fetcher = CrossRefFetcher()
689
- comparator = MetadataComparator()
690
-
691
- if bib_config.check_usage:
692
- usage_checker = UsageChecker(tex_parser)
693
-
694
- if bib_config.check_duplicates:
695
- duplicate_detector = DuplicateDetector()
696
-
697
- # Initialize report generator
698
- report_gen = ReportGenerator(
699
- minimal_verified=False,
700
- check_preprint_ratio=bib_config.check_preprint_ratio,
701
- preprint_warning_threshold=bib_config.preprint_warning_threshold
702
  )
703
- report_gen.set_metadata([bib_file.name], [tex_file.name])
704
-
705
- # Run submission quality checks
706
- progress(0.2, desc="Running LaTeX quality checks...")
707
- submission_results = []
708
- enabled_checkers = config.submission.get_enabled_checkers()
709
-
710
- for checker_name in enabled_checkers:
711
- if checker_name in CHECKER_REGISTRY:
712
- checker = CHECKER_REGISTRY[checker_name]()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
  results = checker.check(tex_content, {})
714
  for r in results:
715
- r.file_path = tex_file.name
716
  submission_results.extend(results)
717
-
718
- report_gen.set_submission_results(submission_results, None)
719
-
720
- # Check for duplicates
721
- if bib_config.check_duplicates and duplicate_detector:
722
- duplicate_groups = duplicate_detector.find_duplicates(entries)
723
- report_gen.set_duplicate_groups(duplicate_groups)
724
-
725
- # Check missing citations
726
- if bib_config.check_usage and usage_checker:
727
- missing = usage_checker.get_missing_entries(entries)
728
- report_gen.set_missing_citations(missing)
729
-
730
- # Build workflow
731
- workflow_config = get_default_workflow()
732
-
733
- # Process entries
734
- progress(0.3, desc="Processing bibliography entries...")
735
- total_entries = len(entries)
736
-
737
- for i, entry in enumerate(entries):
738
- progress(0.3 + 0.5 * (i / total_entries), desc=f"Checking: {entry.key}")
739
-
740
- # Check usage
741
- usage_result = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742
  if usage_checker:
743
  usage_result = usage_checker.check_usage(entry)
744
-
745
- # Fetch and compare metadata
746
- comparison_result = None
747
  if bib_config.check_metadata and comparator:
748
  comparison_result = fetch_and_compare_with_workflow(
749
  entry, workflow_config, arxiv_fetcher, crossref_fetcher,
750
- semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
751
  )
752
-
753
- # Create entry report
754
- entry_report = EntryReport(
755
- entry=entry,
756
- comparison=comparison_result,
757
- usage=usage_result,
758
- evaluations=[]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
759
  )
760
- report_gen.add_entry_report(entry_report)
761
-
762
- progress(0.85, desc="Generating structured reports...")
763
-
764
- # Generate Bibliography HTML Report
765
- bib_report = generate_bibliography_html(report_gen, entries)
766
-
767
- # Generate LaTeX Quality HTML Report
768
- latex_report = generate_latex_html(submission_results)
769
-
770
- # Generate Line-by-Line HTML Report
771
- line_report = ""
772
- if submission_results:
773
- line_report = generate_line_html(tex_content, submission_results)
774
- else:
775
- line_report = '<div class="report-container"><div class="report-card"><div class="card-content">No issues to display line-by-line.</div></div></div>'
776
-
777
- progress(1.0, desc="Done!")
778
-
779
- return bib_report, latex_report, line_report
780
-
781
- except Exception as e:
782
- error_msg = f" Error: {str(e)}"
783
- import traceback
784
- error_msg += f"\n\n```\n{traceback.format_exc()}\n```"
785
- return error_msg, error_msg, error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
 
 
 
 
 
 
 
 
 
 
 
 
787
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
788
 
789
- def create_app():
790
- """Create and configure the Gradio app."""
791
-
792
- # Load icon as base64
793
- icon_html = ""
794
  try:
795
- icon_path = Path("assets/icon-192.png")
796
  if icon_path.exists():
797
  with open(icon_path, "rb") as f:
798
- encoding = base64.b64encode(f.read()).decode()
799
- icon_html = f'<img src="data:image/png;base64,{encoding}" style="width: 48px; height: 48px; border-radius: 8px;" alt="BibGuard">'
800
- else:
801
- icon_html = '<span style="font-size: 48px;">📚</span>'
802
- except Exception:
803
- icon_html = '<span style="font-size: 48px;">📚</span>'
804
-
805
- with gr.Blocks(title="BibGuard - Bibliography & LaTeX Quality Checker") as app:
806
-
807
- # Header with icon
808
- with gr.Row(elem_classes=["app-header"]):
809
- gr.HTML(f"""
810
- <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 16px;">
811
- {icon_html}
812
- <div>
813
- <h1 style="margin: 0; font-size: 1.8em;">BibGuard</h1>
814
- <p style="margin: 0; color: #666; font-size: 14px;">Bibliography & LaTeX Quality Checker</p>
815
- </div>
816
- </div>
817
- """)
818
-
819
- with gr.Row(elem_classes=["app-body"]):
820
- # Left column: Upload & Settings
821
- with gr.Column(scale=1, min_width=280, elem_classes=["app-sidebar"]):
822
- gr.Markdown("### 📁 Upload Files")
823
-
824
- bib_file = gr.File(
825
- label="Bibliography (.bib)",
826
- file_types=[".bib"],
827
- file_count="single"
 
 
 
 
 
 
 
 
 
 
828
  )
829
-
830
- tex_file = gr.File(
831
- label="LaTeX Source (.tex)",
832
- file_types=[".tex"],
833
- file_count="single"
 
 
834
  )
835
-
836
- # Check options in grid layout
837
- gr.Markdown("#### ⚙️ Options")
838
-
839
- with gr.Row():
840
- check_metadata = gr.Checkbox(label="🔍 Metadata", value=False)
841
- check_usage = gr.Checkbox(label="📊 Usage", value=True)
842
-
843
- with gr.Row():
844
- check_duplicates = gr.Checkbox(label="👯 Duplicates", value=True)
845
- check_preprint_ratio = gr.Checkbox(label="📄 Preprints", value=True)
846
-
847
- with gr.Row():
848
- caption = gr.Checkbox(label="🖼️ Captions", value=True)
849
- reference = gr.Checkbox(label="🔗 References", value=True)
850
-
851
- with gr.Row():
852
- formatting = gr.Checkbox(label="✨ Formatting", value=True)
853
- equation = gr.Checkbox(label="🔢 Equations", value=True)
854
-
855
- with gr.Row():
856
- ai_artifacts = gr.Checkbox(label="🤖 AI Artifacts", value=True)
857
- sentence = gr.Checkbox(label="📝 Sentences", value=True)
858
-
859
- with gr.Row():
860
- consistency = gr.Checkbox(label="🔄 Consistency", value=True)
861
- acronym = gr.Checkbox(label="🔤 Acronyms", value=True)
862
-
863
- with gr.Row():
864
- number = gr.Checkbox(label="🔢 Numbers", value=True)
865
- citation_quality = gr.Checkbox(label="📚 Citations", value=True)
866
-
867
- with gr.Row():
868
- anonymization = gr.Checkbox(label="🎭 Anonymization", value=True)
869
-
870
- run_btn = gr.Button("🔍 Check Now", variant="primary", size="lg")
871
-
872
- gr.HTML("""
873
- <div style="text-align: center; margin-top: 16px;">
874
- <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="text-decoration: none; color: #666; display: inline-flex; align-items: center; gap: 6px;">
875
- <svg height="20" width="20" viewBox="0 0 16 16"><path fill="currentColor" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"></path></svg>
876
- GitHub
877
- </a>
878
- <p style="margin: 8px 0 0 0; color: #999; font-size: 12px;">Developed with ❤️ for researchers</p>
879
- </div>
880
- """)
881
-
882
- # Right column: Reports
883
- with gr.Column(scale=4, elem_classes=["app-content"]):
884
- with gr.Tabs():
885
- with gr.Tab("📚 Bibliography Report"):
886
- bib_report = gr.HTML(
887
- value=WELCOME_HTML,
888
- elem_classes=["report-panel"]
889
- )
890
-
891
- with gr.Tab("📝 LaTeX Quality"):
892
- latex_report = gr.HTML(
893
- value=WELCOME_HTML,
894
- elem_classes=["report-panel"]
895
- )
896
-
897
- with gr.Tab("📋 Line-by-Line"):
898
- line_report = gr.HTML(
899
- value=WELCOME_HTML,
900
- elem_classes=["report-panel"]
901
- )
902
-
903
- # Event handling
904
- run_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
905
  fn=run_check,
906
  inputs=[
907
- bib_file, tex_file,
908
  check_metadata, check_usage, check_duplicates, check_preprint_ratio,
909
  caption, reference, formatting, equation, ai_artifacts,
910
- sentence, consistency, acronym, number, citation_quality, anonymization
 
911
  ],
912
- outputs=[bib_report, latex_report, line_report]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
913
  )
914
-
915
  return app
916
 
917
 
918
- # Create the app
919
  app = create_app()
920
 
 
921
  if __name__ == "__main__":
 
922
  app.launch(
923
- favicon_path="assets/icon-192.png",
924
  show_error=True,
925
  css=CUSTOM_CSS,
926
- theme=gr.themes.Soft()
927
  )
 
1
  #!/usr/bin/env python3
2
  """
3
+ BibGuard Gradio web app — minimalist iframe layout.
4
 
5
+ The right pane embeds the self-contained ``report.html`` produced by
6
+ ``src/report/html_report.py`` via ``<iframe srcdoc=...>``. This makes the
7
+ generated report the single source of truth (per-section filters, full-text
8
+ search, dark mode, inline span highlighting all live inside it) and avoids
9
+ re-rendering the same content inside Gradio with stale styles.
10
  """
11
+ from __future__ import annotations
12
+
13
+ import base64
14
+ import logging
15
+ import os
16
  import tempfile
17
+ import time
18
  from pathlib import Path
19
+
20
+ import gradio as gr
21
 
22
  from src.parsers import BibParser, TexParser
23
+ from src.fetchers import (
24
+ ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher,
25
+ OpenAlexFetcher, DBLPFetcher,
26
+ )
27
  from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
28
  from src.report.generator import ReportGenerator, EntryReport
29
+ from src.config.yaml_config import (
30
+ BibGuardConfig, BibliographyConfig, SubmissionConfig, OutputConfig,
31
+ )
32
+ from src.config.workflow import get_default_workflow
33
  from src.checkers import CHECKER_REGISTRY
34
+ from src.checkers.retraction_checker import RetractionChecker
35
+ from src.checkers.url_checker import URLChecker
36
+ from src.utils import http as http_layer
37
+ from src.utils.logging_setup import setup as setup_logging, capture_run
38
+ from src.utils.validation import validate_bib, validate_tex, format_report
39
  from app_helper import fetch_and_compare_with_workflow
40
 
41
+ LOG_PATH = setup_logging(os.environ.get("BIBGUARD_LOG", "WARNING"))
42
+ logger = logging.getLogger("bibguard.app")
43
+ logger.info("BibGuard app starting (log file: %s)", LOG_PATH)
44
+
45
+ # Configure HTTP layer once at import time.
46
+ http_layer.configure(
47
+ contact_email=os.environ.get("BIBGUARD_CONTACT_EMAIL", ""),
48
+ cache_enabled=True,
49
+ cache_ttl_hours=24,
50
+ retry_total=5,
51
+ retry_backoff_factor=1.5,
52
+ )
53
+
54
+
55
+ # --------------------------------------------------------------------- presets
56
+
57
+ PRESETS = {
58
+ "Quick": {
59
+ "check_metadata": False, "check_duplicates": True, "check_usage": True, "check_preprint_ratio": True,
60
+ "url_liveness": False, "retraction": False,
61
+ "submission": {"caption": True, "reference": True, "formatting": True, "equation": True,
62
+ "ai_artifacts": True, "sentence": True, "consistency": True, "acronym": True,
63
+ "number": True, "citation_quality": True, "anonymization": True},
64
+ },
65
+ "Standard": {
66
+ "check_metadata": False, "check_duplicates": True, "check_usage": True, "check_preprint_ratio": True,
67
+ "url_liveness": False, "retraction": True,
68
+ "submission": {"caption": True, "reference": True, "formatting": True, "equation": True,
69
+ "ai_artifacts": True, "sentence": True, "consistency": True, "acronym": True,
70
+ "number": True, "citation_quality": True, "anonymization": True},
71
+ },
72
+ "Strict": {
73
+ "check_metadata": True, "check_duplicates": True, "check_usage": True, "check_preprint_ratio": True,
74
+ "url_liveness": True, "retraction": True,
75
+ "submission": {"caption": True, "reference": True, "formatting": True, "equation": True,
76
+ "ai_artifacts": True, "sentence": True, "consistency": True, "acronym": True,
77
+ "number": True, "citation_quality": True, "anonymization": True},
78
+ },
79
+ }
80
+
81
+
82
+ # ----------------------------------------------------------------------- CSS
83
 
 
84
  CUSTOM_CSS = """
85
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
86
 
87
+ * { font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; }
 
 
 
88
 
89
+ /* Reserve space for the vertical scrollbar so expanding the Advanced
90
+ accordion (or anything else that adds content) doesn't shift the
91
+ layout horizontally. `overflow-y: scroll` on html is the universal
92
+ fallback for browsers without scrollbar-gutter.
93
+ `overflow-x: hidden` on body kills any page-width jitter coming from
94
+ inner elements that briefly overflow during streaming updates. */
95
+ html { scrollbar-gutter: stable; overflow-y: scroll; overflow-x: hidden; }
96
+ body { overflow-x: hidden; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  .gradio-container {
99
+ max-width: 1400px !important;
100
+ margin: 0 auto !important;
101
+ padding: 0 20px !important;
102
+ box-sizing: border-box !important;
103
  width: 100% !important;
104
+ overflow-x: hidden !important;
 
 
105
  }
106
 
107
+ /* Header strip */
108
+ .bg-header {
109
+ padding: 14px 4px 12px !important;
 
110
  border-bottom: 1px solid #e5e7eb;
111
+ margin-bottom: 14px;
112
  }
113
 
114
+ /* ==================================================================
115
+ Top toolbar — single horizontal row with all primary controls.
116
+ Every primary control has the SAME explicit 56px height. The little
117
+ filename/info chip beneath sits in a fixed 18px slot. The columns
118
+ wrap that into a 78px tall toolbar that's identical across cells.
119
+ ================================================================== */
120
+ .bg-toolbar {
121
+ margin-bottom: 14px;
122
+ gap: 10px !important;
123
+ align-items: flex-start !important;
124
  }
125
+ .bg-toolbar .gr-form { gap: 0 !important; }
126
+ .bg-toolbar .gr-block { border: none !important; box-shadow: none !important; padding: 0 !important; }
127
+
128
+ /* Common: any direct primary control fills column width */
129
+ .bg-toolbar > * { width: 100% !important; }
130
+
131
+ /* ---- Upload buttons ---- */
132
+ .bg-upload-btn,
133
+ .bg-upload-btn > .wrap,
134
+ .bg-upload-btn > div {
135
+ height: 56px !important;
136
+ min-height: 56px !important;
137
+ max-height: 56px !important;
138
+ width: 100% !important;
139
  }
140
+ .bg-upload-btn button {
141
+ height: 56px !important;
142
+ min-height: 56px !important;
143
+ max-height: 56px !important;
144
+ width: 100% !important;
145
+ padding: 0 14px !important;
146
+ font-size: 13px !important;
147
+ font-weight: 500 !important;
148
+ border-radius: 8px !important;
149
+ border: 1px dashed #cbd5e1 !important;
150
+ background: #f8fafc !important;
151
+ color: #334155 !important;
152
+ transition: border 0.15s, background 0.15s !important;
153
+ line-height: 1 !important;
154
  }
155
+ .bg-upload-btn button:hover {
156
+ border-color: #2563eb !important;
157
+ background: #eff6ff !important;
158
+ color: #1e3a8a !important;
 
 
 
 
 
 
159
  }
160
 
161
+ /* ---- Run / Stop button (same column, visibility-swapped) ---- */
162
+ .bg-run-btn,
163
+ .bg-run-btn > .wrap,
164
+ .bg-run-btn > div {
165
+ height: 56px !important;
166
+ min-height: 56px !important;
167
+ max-height: 56px !important;
168
+ width: 100% !important;
169
  }
170
+ .bg-run-btn button {
171
+ height: 56px !important;
172
+ min-height: 56px !important;
173
+ max-height: 56px !important;
174
+ width: 100% !important;
175
+ font-weight: 600 !important;
176
+ border-radius: 8px !important;
177
+ font-size: 14px !important;
178
+ line-height: 1 !important;
179
+ padding: 0 16px !important;
180
  }
181
+ .bg-stop-btn button {
182
+ background: #dc2626 !important;
183
+ color: white !important;
184
+ border: none !important;
 
 
185
  }
186
+ .bg-stop-btn button:hover { background: #b91c1c !important; }
187
+
188
+ /* ---- Preset radio as horizontal pill chips ---- */
189
+ .bg-preset,
190
+ .bg-preset > div,
191
+ .bg-preset > .wrap {
192
+ height: 56px !important;
193
+ min-height: 56px !important;
194
+ max-height: 56px !important;
195
+ padding: 0 !important;
196
  }
197
+ .bg-preset > label,
198
+ .bg-preset .label-wrap { display: none !important; }
199
+ .bg-preset .wrap,
200
+ .bg-preset > div > div,
201
+ .bg-preset fieldset {
202
+ display: flex !important;
203
+ flex-direction: row !important;
204
+ gap: 4px !important;
205
+ flex-wrap: nowrap !important;
206
+ width: 100% !important;
207
+ height: 56px !important;
208
+ align-items: stretch !important;
209
+ border: none !important;
210
+ padding: 0 !important;
211
+ margin: 0 !important;
212
  }
213
+ .bg-preset label {
214
+ flex: 1 1 0 !important;
215
+ margin: 0 !important;
216
+ padding: 0 8px !important;
217
+ height: 56px !important;
218
+ min-height: 56px !important;
219
+ max-height: 56px !important;
220
+ border-radius: 8px !important;
221
+ font-size: 13px !important;
222
+ font-weight: 500 !important;
223
+ border: 1px solid #e5e7eb !important;
224
+ background: #ffffff !important;
225
+ cursor: pointer !important;
226
+ text-align: center !important;
227
+ display: inline-flex !important;
228
+ align-items: center !important;
229
+ justify-content: center !important;
230
+ line-height: 1 !important;
231
+ color: #475569 !important;
232
+ transition: background 0.15s, border 0.15s !important;
233
+ white-space: nowrap !important;
234
  }
235
+ .bg-preset label:hover { background: #f8fafc !important; border-color: #cbd5e1 !important; }
236
+ .bg-preset input[type="radio"] { display: none !important; }
237
+ .bg-preset label.selected,
238
+ .bg-preset label:has(input:checked) {
239
+ background: #1e3a8a !important;
240
+ color: #ffffff !important;
241
+ border-color: #1e3a8a !important;
 
 
 
 
 
 
242
  }
243
 
244
+ /* ---- Caption chip beneath each toolbar control ---- */
245
+ .bg-fname {
246
+ font-size: 11.5px;
247
+ color: #94a3b8;
248
+ padding: 4px 8px 0 8px;
249
+ line-height: 1.3;
250
+ overflow: hidden;
251
+ text-overflow: ellipsis;
252
+ white-space: nowrap;
253
+ height: 18px;
254
+ box-sizing: content-box;
255
  }
256
+ .bg-fname.ok { color: #166534; font-weight: 500; }
257
+
258
+ /* ==================================================================
259
+ Advanced settings — gr.Row with each Checkbox as its own card.
260
+ Trick: `display: contents` on Gradio's intermediate wrapper makes
261
+ it vanish from the layout tree, so the actual checkbox blocks
262
+ become direct flex children of .bg-row. Card style is applied to
263
+ each block, not the wrapper, so we get N cards per row instead of
264
+ one big box.
265
+ ================================================================== */
266
+ .bg-row {
267
+ display: flex !important;
268
+ flex-direction: row !important;
269
+ gap: 10px !important;
270
+ align-items: stretch !important;
271
+ padding: 4px 0 !important;
272
  }
273
 
274
+ /* Flatten Gradio's intermediate `.form` / `.gr-form` wrapper so its
275
+ children become direct flex items of .bg-row. */
276
+ .bg-row > .form,
277
+ .bg-row > .gr-form {
278
+ display: contents !important;
279
+ }
280
+ /* Some Gradio versions emit a plain `<div>` wrapper instead of `.form`.
281
+ We can't safely `display: contents` every direct div (the spacer is
282
+ one), but if the wrapper has only blocks inside, contents flatten it. */
283
+ .bg-row > div:not(.bg-row-spacer):not(.gr-block):not(.block) {
284
+ display: contents !important;
 
 
 
 
 
 
285
  }
286
 
287
+ /* Each individual checkbox block = a card */
288
+ .bg-row .gr-block,
289
+ .bg-row .block {
290
+ flex: 1 1 0 !important;
291
+ min-width: 0 !important;
292
+ background: #f8fafc !important;
293
+ border: 1px solid #e5e7eb !important;
294
+ border-radius: 8px !important;
295
+ padding: 8px 12px !important;
296
+ box-shadow: none !important;
297
+ transition: background 0.15s, border 0.15s !important;
298
  }
299
+ .bg-row .gr-block:hover,
300
+ .bg-row .block:hover {
301
+ background: #eff6ff !important;
302
+ border-color: #cbd5e1 !important;
303
  }
304
+ .bg-row label,
305
+ .bg-row .gr-checkbox label {
306
+ font-size: 13px !important;
307
+ font-weight: 500 !important;
308
+ line-height: 1.3 !important;
309
+ color: #334155 !important;
310
+ margin: 0 !important;
311
+ padding: 0 !important;
 
 
 
312
  }
313
+ .bg-row .gr-info, .bg-row [class*="info"] { display: none !important; }
314
 
315
+ /* Spacer — invisible flex item that just preserves alignment */
316
+ .bg-row .bg-row-spacer {
317
+ flex: 1 1 0 !important;
318
+ background: transparent !important;
319
+ border: none !important;
320
+ box-shadow: none !important;
321
+ padding: 0 !important;
322
+ visibility: hidden !important;
 
323
  }
324
 
325
+ /* ==================================================================
326
+ Status strip thin one-liner above the report.
327
+ The Gradio HTML wrapper itself is pinned to its parent column's width
328
+ so no inner content can change the page geometry during streaming.
329
+ ================================================================== */
330
+ #bg-status-wrap,
331
+ #bg-status-wrap > * {
332
+ width: 100% !important;
333
+ max-width: 100% !important;
334
+ min-width: 0 !important;
335
+ box-sizing: border-box !important;
336
+ overflow-x: hidden !important;
337
  }
338
+ .bg-status {
339
+ padding: 10px 14px;
340
+ border-radius: 10px;
341
+ background: #f8fafc;
342
+ border: 1px solid #e2e8f0;
343
+ font-size: 12.5px;
344
+ line-height: 1.45;
345
+ color: #334155;
346
+ margin: 8px 0 12px 0;
347
+ max-width: 100%;
348
+ overflow: hidden; /* never let inline content widen the page */
349
+ box-sizing: border-box;
350
+ }
351
+ .bg-status-row {
352
  display: flex;
353
+ align-items: center;
354
+ gap: 14px;
355
+ flex-wrap: nowrap; /* one row, ellipsize the middle */
356
+ min-width: 0;
357
+ width: 100%;
358
  }
359
+ .bg-status .bg-status-stage {
 
 
360
  font-weight: 600;
361
+ color: #1e3a8a;
 
 
 
 
 
 
 
 
 
 
 
362
  display: inline-flex;
363
  align-items: center;
364
+ gap: 8px;
365
+ flex-shrink: 0;
366
+ white-space: nowrap;
 
367
  }
368
+ .bg-status .bg-status-detail {
369
+ color: #475569;
370
+ flex: 1 1 0;
371
+ min-width: 0;
372
+ overflow: hidden;
373
+ text-overflow: ellipsis;
374
+ white-space: nowrap;
 
 
 
 
 
375
  }
376
+ .bg-status .bg-status-detail code {
377
+ background: #eef2ff;
378
+ padding: 1px 6px;
 
379
  border-radius: 4px;
380
+ font-size: 11.5px;
381
+ color: #1e3a8a;
 
382
  }
383
+ .bg-status .bg-status-meta {
384
+ color: #64748b;
385
+ font-size: 11.5px;
386
+ display: inline-flex;
387
+ flex-wrap: nowrap;
388
  gap: 12px;
389
+ flex-shrink: 0;
390
+ white-space: nowrap;
391
  }
392
+ .bg-status.done { background: #f0fdf4; border-color: #bbf7d0; }
393
+ .bg-status.done .bg-status-stage { color: #166534; }
394
+ .bg-status.error { background: #fef2f2; border-color: #fecaca; }
395
+ .bg-status.error .bg-status-stage { color: #b91c1c; }
396
+ .bg-status .spin {
397
+ display: inline-block;
398
+ width: 10px; height: 10px;
399
+ border: 2px solid #cbd5e1;
400
+ border-top-color: #2563eb;
401
+ border-radius: 50%;
402
+ animation: bg-spin 0.9s linear infinite;
403
  }
404
+ @keyframes bg-spin { to { transform: rotate(360deg); } }
405
 
406
+ /* ==================================================================
407
+ Report area — full-width iframe.
408
+ ================================================================== */
409
+ .bg-main { padding: 0 !important; }
410
+ .bg-report-iframe {
411
+ width: 100%;
412
+ height: 80vh;
413
+ min-height: 620px;
414
+ border: 1px solid #e5e7eb;
415
+ border-radius: 12px;
416
+ background: white;
417
+ box-shadow: 0 1px 2px rgba(0,0,0,0.04);
418
  }
419
 
420
+ /* Empty / error placeholder (full-width, centered card) */
421
+ .bg-empty {
422
+ display: flex; align-items: center; justify-content: center;
423
+ flex-direction: column; gap: 14px;
424
+ min-height: 60vh;
425
+ color: #6b7280; text-align: center;
426
+ border: 2px dashed #e5e7eb; border-radius: 12px;
427
+ padding: 56px 24px;
428
+ background: #fafafa;
429
  }
430
+ .bg-empty .bg-empty-icon { font-size: 56px; line-height: 1; }
431
+ .bg-empty .bg-empty-title { font-size: 17px; font-weight: 600; color: #374151; }
432
+ .bg-empty .bg-empty-hint { font-size: 14px; max-width: 580px; line-height: 1.6; }
433
+ .bg-empty .bg-empty-hint code { background: #f3f4f6; padding: 1px 6px; border-radius: 4px; font-size: 13px; }
434
+
435
+ /* Compact downloads section */
436
+ .bg-downloads { gap: 6px !important; }
437
+ .bg-downloads .gr-file { min-height: auto !important; }
438
+ .bg-downloads .bg-file-input > label > div {
439
+ height: 52px !important;
440
+ min-height: 52px !important;
441
+ max-height: 52px !important;
442
  }
443
 
444
+ /* Footer */
445
+ .bg-footer {
 
 
 
446
  text-align: center;
447
+ margin-top: 18px;
448
+ padding-top: 12px;
449
+ border-top: 1px solid #f1f5f9;
450
+ font-size: 11.5px;
451
+ color: #9ca3af;
452
  }
453
+ .bg-footer code { background: #f3f4f6; padding: 1px 5px; border-radius: 3px; font-size: 11px; }
454
+ .bg-footer a { color: #6b7280; text-decoration: none; }
455
+ .bg-footer a:hover { text-decoration: underline; }
456
+
457
+ /* Trim accordion chrome a bit */
458
+ .gr-accordion { border-radius: 10px !important; border: 1px solid #e5e7eb !important; }
459
+ .gr-accordion > .label-wrap { padding: 8px 12px !important; font-size: 13px !important; }
460
+
461
+ @media (prefers-color-scheme: dark) {
462
+ .bg-empty { background: #161b22; border-color: #2a313c; color: #9ca3af; }
463
+ .bg-empty .bg-empty-title { color: #e6edf3; }
464
+ .bg-empty .bg-empty-hint code { background: #21262d; }
465
+ .bg-report-iframe { background: #0d1117; border-color: #2a313c; box-shadow: none; }
466
+ .bg-status { background: #0f172a; border-color: #1e293b; color: #cbd5e1; }
467
+ .bg-status .bg-status-stage { color: #93c5fd; }
468
+ .bg-status .bg-status-detail { color: #94a3b8; }
469
+ .bg-status .bg-status-detail code { background: #1e293b; color: #93c5fd; }
470
+ .bg-status .bg-status-meta { color: #64748b; }
471
+ .bg-status.done { background: #052e1a; border-color: #14532d; }
472
+ .bg-status.done .bg-status-stage { color: #86efac; }
473
+ .bg-status.error { background: #2a0e0e; border-color: #7f1d1d; }
474
+ .bg-preset label { background: #161b22 !important; border-color: #2a313c !important; color: #cbd5e1 !important; }
475
+ .bg-preset label:hover { background: #1e293b !important; }
476
+ .bg-preset .selected { background: #2563eb !important; border-color: #2563eb !important; }
477
+ .bg-footer { border-color: #1e293b; }
478
  }
479
+ """
480
 
 
 
 
 
 
 
481
 
482
+ EMPTY_PANEL_HTML = """
483
+ <div class="bg-empty">
484
+ <div class="bg-empty-icon">📄</div>
485
+ <div class="bg-empty-title">Your interactive report appears here</div>
486
+ <div class="bg-empty-hint">
487
+ Upload a <code>.bib</code> file and a <code>.tex</code> file in the toolbar above,
488
+ pick a preset, then press <strong>Run check</strong>. The report renders as a
489
+ self-contained HTML page with per-section filters, full-text search,
490
+ inline span highlighting, and dark-mode support.
491
+ </div>
492
+ </div>
493
  """
494
 
495
+ EMPTY_STATUS_HTML = (
496
+ '<div class="bg-status">'
497
+ '<div class="bg-status-row">'
498
+ '<span class="bg-status-stage">○ Idle</span>'
499
+ '<span class="bg-status-detail">Upload <code>.bib</code> + <code>.tex</code> '
500
+ 'and press <strong>Run check</strong> to begin.</span>'
501
+ '</div></div>'
502
+ )
503
+
504
+
505
+ def _placeholder(message: str, color: str = "#b91c1c") -> str:
506
+ """Inline error/info card shown in place of the iframe."""
507
+ return (
508
+ f'<div class="bg-empty" style="color:{color};border-color:{color}33">'
509
+ f'<div class="bg-empty-icon">⚠️</div>'
510
+ f'<div class="bg-empty-title">{message}</div>'
511
+ f'</div>'
512
+ )
513
+
514
+
515
+ def _html_to_iframe(html: str) -> str:
516
+ """
517
+ Embed an HTML document inside ``<iframe srcdoc>``.
518
+
519
+ We escape only ``&`` and ``"`` — these are the two characters that can
520
+ break the attribute value or get re-decoded as entities. ``<`` and ``>``
521
+ must stay raw, otherwise the inner document would be HTML-encoded.
522
+ """
523
+ escaped = html.replace("&", "&amp;").replace('"', "&quot;")
524
+ return (
525
+ f'<iframe class="bg-report-iframe" srcdoc="{escaped}" '
526
+ f'sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" '
527
+ f'loading="lazy"></iframe>'
528
+ )
529
+
530
+
531
+ def _status_html(stage: str, detail: str = "", meta: list[str] | None = None,
532
+ state: str = "running") -> str:
533
+ """Render the live-status strip shown above the report.
534
+
535
+ Layout is a single horizontal row: [stage] [detail] [meta chips].
536
+ Wraps cleanly on narrow screens.
537
+ """
538
+ if state == "running":
539
+ stage_icon = '<span class="spin"></span>'
540
+ elif state == "done":
541
+ stage_icon = '<span>✓</span>'
542
+ elif state == "error":
543
+ stage_icon = '<span>⚠</span>'
544
+ else:
545
+ stage_icon = '<span>○</span>'
546
+ detail_html = f'<span class="bg-status-detail">{detail}</span>' if detail else '<span class="bg-status-detail"></span>'
547
+ meta_html = ""
548
+ if meta:
549
+ meta_html = (
550
+ '<span class="bg-status-meta">'
551
+ + " ".join(f"<span>{m}</span>" for m in meta)
552
+ + "</span>"
553
+ )
554
+ return (
555
+ f'<div class="bg-status {state}">'
556
+ f'<div class="bg-status-row">'
557
+ f'<span class="bg-status-stage">{stage_icon}<span>{stage}</span></span>'
558
+ f'{detail_html}{meta_html}'
559
+ f'</div></div>'
560
+ )
561
+
562
+
563
+ # --------------------------------------------------------------- config glue
564
 
565
  def create_config_from_ui(
566
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
567
+ caption, reference, formatting, equation, ai_artifacts,
568
+ sentence, consistency, acronym, number, citation_quality, anonymization,
 
 
 
 
 
 
 
 
 
 
 
 
569
  ) -> BibGuardConfig:
 
570
  config = BibGuardConfig()
 
571
  config.bibliography = BibliographyConfig(
572
  check_metadata=check_metadata,
573
  check_usage=check_usage,
574
  check_duplicates=check_duplicates,
575
  check_preprint_ratio=check_preprint_ratio,
576
+ check_relevance=False, # LLM disabled in web mode
577
  )
 
578
  config.submission = SubmissionConfig(
579
+ caption=caption, reference=reference, formatting=formatting, equation=equation,
580
+ ai_artifacts=ai_artifacts, sentence=sentence, consistency=consistency,
581
+ acronym=acronym, number=number, citation_quality=citation_quality,
582
+ anonymization=anonymization,
 
 
 
 
 
 
 
583
  )
 
584
  config.output = OutputConfig(quiet=True, minimal_verified=False)
 
585
  return config
586
 
587
 
588
+ def apply_preset(name: str):
589
+ p = PRESETS.get(name, PRESETS["Standard"])
590
+ sub = p["submission"]
591
+ return (
592
+ p["check_metadata"], p["check_usage"], p["check_duplicates"], p["check_preprint_ratio"],
593
+ sub["caption"], sub["reference"], sub["formatting"], sub["equation"],
594
+ sub["ai_artifacts"], sub["sentence"], sub["consistency"], sub["acronym"],
595
+ sub["number"], sub["citation_quality"], sub["anonymization"],
596
+ p["url_liveness"], p["retraction"],
597
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
 
599
 
600
+ _PRESET_CAPTIONS = {
601
+ "Quick": "local checks only · no network · instant",
602
+ "Standard": "local checks + retraction lookup (CrossRef)",
603
+ "Strict": "+ URL liveness + multi-source metadata (slow)",
604
+ }
605
+
606
+
607
+ def _preset_caption_html(name: str) -> str:
608
+ text = _PRESET_CAPTIONS.get(name, "")
609
+ return f'<div class="bg-fname" style="text-align:center">{text}</div>'
610
 
611
 
612
+ # ------------------------------------------------------------------ run_check
613
+ # Streaming generator. Each yield is a 7-tuple:
614
+ # (iframe_html, status_html, html_path, md_path, json_path,
615
+ # cleaned_bib_path, log_path)
616
+ # `capture_run` attaches a per-run DEBUG file handler so any exception or
617
+ # warning anywhere in the pipeline is recorded with full traceback at
618
+ # `<out_dir>/bibguard.log`, which is then downloadable. The status panel
619
+ # surfaces warning+error counts so problems aren't invisible.
620
+
621
  def run_check(
622
+ bib_file, tex_file,
623
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
624
+ caption, reference, formatting, equation, ai_artifacts,
625
+ sentence, consistency, acronym, number, citation_quality, anonymization,
626
+ url_liveness=False, retraction=True,
627
+ ):
628
+ """Run the full check pipeline as a streaming generator with per-run logging.
629
+
630
+ `bib_file` / `tex_file` are filesystem path strings (carried by gr.State),
631
+ not gr.File objects. The status panel is the single source of progress
632
+ feedback — no separate gr.Progress bar.
633
+ """
634
+ started = time.time()
635
+
636
+ def _elapsed() -> str:
637
+ return f"⏱ {int(time.time() - started)}s"
638
+
639
+ # Initial state: keep current report (None means clear).
640
+ if not bib_file or not tex_file:
641
+ yield (
642
+ _placeholder("Please choose both a .bib and a .tex file in the toolbar."),
643
+ _status_html("Waiting for files",
644
+ "Pick a .bib and a .tex file from the toolbar to start.",
645
+ state="error"),
646
+ None, None, None, None, None,
 
647
  )
648
+ return
649
+
650
+ # Allocate the artifact dir up-front so the per-run log lives next to
651
+ # the report files.
652
+ out_dir = Path(tempfile.mkdtemp(prefix="bibguard_"))
653
+ log_path_target = out_dir / "bibguard.log"
654
+
655
+ # Reset per-source circuit breakers so a previous run's flaky source
656
+ # doesn't carry over and skip valid lookups in this run.
657
+ http_layer.reset_breakers()
658
+
659
+ with capture_run(target_path=log_path_target) as (log_path, log_stats):
660
+ logger.info("=== run_check start: bib=%s tex=%s ===", bib_file, tex_file)
661
+ try:
662
+ yield from _run_check_impl(
663
+ bib_file, tex_file, out_dir, log_path, log_stats,
664
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
665
+ caption, reference, formatting, equation, ai_artifacts,
666
+ sentence, consistency, acronym, number, citation_quality, anonymization,
667
+ url_liveness, retraction, started, _elapsed,
668
+ )
669
+ except Exception as e:
670
+ logger.exception("run_check crashed (entry-level guard)")
671
+ yield (
672
+ _placeholder(f"Unhandled error: {e}"),
673
+ _status_html("Failed", f"{e} — see <code>bibguard.log</code> for the full traceback.",
674
+ state="error"),
675
+ None, None, None, None, str(log_path),
676
+ )
677
+ finally:
678
+ logger.info("=== run_check end: warnings=%d errors=%d ===",
679
+ log_stats.warnings, log_stats.errors)
680
+
681
+
682
+ def _run_check_impl(
683
+ bib_file, tex_file, out_dir, log_path, log_stats,
684
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
685
+ caption, reference, formatting, equation, ai_artifacts,
686
+ sentence, consistency, acronym, number, citation_quality, anonymization,
687
+ url_liveness, retraction, started, _elapsed,
688
+ ):
689
+ """Inner pipeline. Wrapped in `capture_run` by `run_check`.
690
+
691
+ Every yield is a 7-tuple ending with the log path so the user can
692
+ download `bibguard.log` even from intermediate updates.
693
+ """
694
+ log_path_str = str(log_path)
695
+
696
+ bib_path = Path(bib_file)
697
+ tex_path = Path(tex_file)
698
+ logger.info("Inputs: bib=%s tex=%s out_dir=%s", bib_path, tex_path, out_dir)
699
+
700
+ def _meta_with_logs(extra: list[str]) -> list[str]:
701
+ out = list(extra)
702
+ if log_stats.warnings or log_stats.errors:
703
+ out.append(f"⚠ {log_stats.warnings}w / {log_stats.errors}e logged")
704
+ return out
705
+
706
+ yield (
707
+ gr.update(),
708
+ _status_html("Validating files",
709
+ f"Reading <code>{bib_path.name}</code> and <code>{tex_path.name}</code>",
710
+ meta=_meta_with_logs([_elapsed()])),
711
+ None, None, None, None, log_path_str,
712
+ )
713
+
714
+ # Pre-flight content validation
715
+ bib_rep = validate_bib(bib_path)
716
+ tex_rep = validate_tex(tex_path)
717
+ msg = "\n".join(filter(None, [
718
+ format_report(bib_rep, bib_path.name),
719
+ format_report(tex_rep, tex_path.name),
720
+ ]))
721
+ if not bib_rep.ok or not tex_rep.ok:
722
+ logger.error("File validation failed:\n%s", msg)
723
+ block = (
724
+ f'<div class="bg-empty" style="color:#b91c1c;border-color:#b91c1c33">'
725
+ f'<div class="bg-empty-icon">⚠️</div>'
726
+ f'<div class="bg-empty-title">File validation failed</div>'
727
+ f'<pre style="white-space:pre-wrap;font-size:13px;color:#7f1d1d;'
728
+ f'background:#fef2f2;padding:12px;border-radius:6px;max-width:540px">{msg}</pre>'
729
+ f'</div>'
730
  )
731
+ yield (
732
+ block,
733
+ _status_html("File validation failed", msg.replace("\n", "<br>"),
734
+ state="error"),
735
+ None, None, None, None, log_path_str,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
736
  )
737
+ return
738
+ elif msg:
739
+ logger.info("Validation warnings:\n%s", msg)
740
+
741
+ config = create_config_from_ui(
742
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
743
+ caption, reference, formatting, equation, ai_artifacts,
744
+ sentence, consistency, acronym, number, citation_quality, anonymization,
745
+ )
746
+
747
+ yield (
748
+ gr.update(),
749
+ _status_html("Parsing", "Loading bibliography and LaTeX source",
750
+ meta=_meta_with_logs([_elapsed()])),
751
+ None, None, None, None, log_path_str,
752
+ )
753
+
754
+ tex_content = tex_path.read_text(encoding='utf-8', errors='replace')
755
+ bib_parser = BibParser()
756
+ entries = bib_parser.parse_file(str(bib_path))
757
+ tex_parser = TexParser()
758
+ tex_parser.parse_file(str(tex_path))
759
+ logger.info("Parsed %d bib entries from %s", len(entries), bib_path.name)
760
+
761
+ bib_config = config.bibliography
762
+
763
+ # Init components
764
+ arxiv_fetcher = crossref_fetcher = ss_fetcher = oa_fetcher = dblp_fetcher = None
765
+ comparator = usage_checker = duplicate_detector = None
766
+
767
+ if bib_config.check_metadata:
768
+ arxiv_fetcher = ArxivFetcher()
769
+ ss_fetcher = SemanticScholarFetcher()
770
+ oa_fetcher = OpenAlexFetcher()
771
+ dblp_fetcher = DBLPFetcher()
772
+ crossref_fetcher = CrossRefFetcher()
773
+ comparator = MetadataComparator()
774
+ if bib_config.check_usage:
775
+ usage_checker = UsageChecker(tex_parser)
776
+ if bib_config.check_duplicates:
777
+ duplicate_detector = DuplicateDetector()
778
+
779
+ report_gen = ReportGenerator(
780
+ minimal_verified=False,
781
+ check_preprint_ratio=bib_config.check_preprint_ratio,
782
+ preprint_warning_threshold=bib_config.preprint_warning_threshold,
783
+ )
784
+ report_gen.set_metadata([str(bib_path)], [str(tex_path)])
785
+
786
+ # Submission quality checks
787
+ yield (
788
+ gr.update(),
789
+ _status_html("LaTeX quality checks",
790
+ f"Running {len(config.submission.get_enabled_checkers())} checkers on the LaTeX source",
791
+ meta=_meta_with_logs([f"📚 {len(entries)} bib entries", _elapsed()])),
792
+ None, None, None, None, log_path_str,
793
+ )
794
+ submission_results = []
795
+ for name in config.submission.get_enabled_checkers():
796
+ if name in CHECKER_REGISTRY:
797
+ try:
798
+ checker = CHECKER_REGISTRY[name]()
799
  results = checker.check(tex_content, {})
800
  for r in results:
801
+ r.file_path = str(tex_path)
802
  submission_results.extend(results)
803
+ except Exception:
804
+ logger.exception("Checker %s crashed", name)
805
+ report_gen.set_submission_results(submission_results, None)
806
+
807
+ if bib_config.check_duplicates and duplicate_detector:
808
+ try:
809
+ report_gen.set_duplicate_groups(duplicate_detector.find_duplicates(entries))
810
+ except Exception:
811
+ logger.exception("Duplicate detection crashed")
812
+ if bib_config.check_usage and usage_checker:
813
+ try:
814
+ report_gen.set_missing_citations(usage_checker.get_missing_entries(entries))
815
+ except Exception:
816
+ logger.exception("Missing-citation lookup crashed")
817
+
818
+ # Per-entry workflow
819
+ total = max(1, len(entries))
820
+ workflow_config = get_default_workflow()
821
+ verified_count = 0
822
+ flagged_count = 0
823
+ not_found_count = 0
824
+ last_yield = time.time()
825
+
826
+ def _identifier_chip(entry) -> str:
827
+ """Tiny inline hint about which IDs we have for this entry."""
828
+ bits = []
829
+ if entry.doi: bits.append("DOI")
830
+ if entry.has_arxiv: bits.append("arXiv")
831
+ if entry.title and not bits: bits.append("title")
832
+ elif entry.title: bits.append("title")
833
+ return " + ".join(bits) if bits else "no identifiers"
834
+
835
+ def _outcome_label(cmp) -> str:
836
+ if cmp is None:
837
+ return ""
838
+ if cmp.source == "unable":
839
+ return "<span style='color:#b45309'>? no metadata</span>"
840
+ if cmp.is_match:
841
+ return f"<span style='color:#166534'>✓ verified by {cmp.source}</span>"
842
+ return f"<span style='color:#b45309'>⚠ flagged ({cmp.source})</span>"
843
+
844
+ for i, entry in enumerate(entries):
845
+ # ── Pre-fetch status: announce identifier set BEFORE the network roundtrip
846
+ # so the user sees what's being attempted, not just the entry name.
847
+ if bib_config.check_metadata and comparator:
848
+ now = time.time()
849
+ if now - last_yield > 0.4 or i == 0:
850
+ ids = _identifier_chip(entry)
851
+ detail = f"<code>{entry.key}</code> · querying via <strong>{ids}</strong>"
852
+ if entry.title:
853
+ short = entry.title[:70] + ("…" if len(entry.title) > 70 else "")
854
+ detail += f" — <span style='color:#64748b'>{short}</span>"
855
+ yield (
856
+ gr.update(),
857
+ _status_html(
858
+ f"Verifying entry {i + 1}/{total}",
859
+ detail,
860
+ meta=_meta_with_logs([
861
+ f"📚 {total} total",
862
+ f"✓ {verified_count}",
863
+ f"⚠ {flagged_count}",
864
+ f"? {not_found_count}",
865
+ _elapsed(),
866
+ ]),
867
+ ),
868
+ None, None, None, None, log_path_str,
869
+ )
870
+ last_yield = now
871
+
872
+ usage_result = None
873
+ comparison_result = None
874
+ try:
875
  if usage_checker:
876
  usage_result = usage_checker.check_usage(entry)
877
+ except Exception:
878
+ logger.exception("Usage check crashed for entry=%s", entry.key)
879
+ try:
880
  if bib_config.check_metadata and comparator:
881
  comparison_result = fetch_and_compare_with_workflow(
882
  entry, workflow_config, arxiv_fetcher, crossref_fetcher,
883
+ ss_fetcher, oa_fetcher, dblp_fetcher, comparator,
884
  )
885
+ if comparison_result is None or comparison_result.source == "unable":
886
+ not_found_count += 1
887
+ elif comparison_result.is_match:
888
+ verified_count += 1
889
+ else:
890
+ flagged_count += 1
891
+ except Exception:
892
+ logger.exception("Metadata fetch crashed for entry=%s", entry.key)
893
+ report_gen.add_entry_report(EntryReport(
894
+ entry=entry, comparison=comparison_result,
895
+ usage=usage_result, evaluations=[],
896
+ ))
897
+
898
+ # ── Post-fetch status: show outcome inline so the user can watch
899
+ # results stream in (verified / flagged / not found).
900
+ now = time.time()
901
+ if now - last_yield > 0.4 or i == total - 1:
902
+ outcome = _outcome_label(comparison_result)
903
+ detail_parts = [f"<code>{entry.key}</code>"]
904
+ if outcome:
905
+ detail_parts.append(outcome)
906
+ if entry.title:
907
+ short = entry.title[:70] + ("…" if len(entry.title) > 70 else "")
908
+ detail_parts.append(f"<span style='color:#64748b'>{short}</span>")
909
+ detail = " · ".join(detail_parts)
910
+ meta = _meta_with_logs([
911
+ f"📚 {i + 1}/{total}",
912
+ f"✓ {verified_count}",
913
+ f"⚠ {flagged_count}",
914
+ f"? {not_found_count}",
915
+ _elapsed(),
916
+ ])
917
+ yield (
918
+ gr.update(),
919
+ _status_html(f"Bibliography {i + 1}/{total}", detail, meta=meta),
920
+ None, None, None, None, log_path_str,
921
  )
922
+ last_yield = now
923
+
924
+ if retraction:
925
+ try:
926
+ doi_count = sum(1 for e in entries if getattr(e, "doi", ""))
927
+ yield (
928
+ gr.update(),
929
+ _status_html("Retraction lookups",
930
+ f"Querying CrossRef for {doi_count} DOI(s)",
931
+ meta=_meta_with_logs([_elapsed()])),
932
+ None, None, None, None, log_path_str,
933
+ )
934
+ report_gen.set_retraction_findings(RetractionChecker().check_entries(entries))
935
+ except Exception:
936
+ logger.exception("Retraction lookup crashed")
937
+
938
+ if url_liveness:
939
+ try:
940
+ url_count = sum(1 for e in entries if getattr(e, "url", ""))
941
+ yield (
942
+ gr.update(),
943
+ _status_html("URL liveness",
944
+ f"HEAD-checking {url_count} URL(s) in parallel",
945
+ meta=_meta_with_logs([_elapsed()])),
946
+ None, None, None, None, log_path_str,
947
+ )
948
+ report_gen.set_url_findings(URLChecker().check_entries(entries))
949
+ except Exception:
950
+ logger.exception("URL liveness crashed")
951
+
952
+ # Save artifacts
953
+ yield (
954
+ gr.update(),
955
+ _status_html("Building report",
956
+ "Rendering self-contained HTML, JSON, and Markdown",
957
+ meta=_meta_with_logs([_elapsed()])),
958
+ None, None, None, None, log_path_str,
959
+ )
960
+ html_path = out_dir / "report.html"
961
+ md_path = out_dir / "bibliography_report.md"
962
+ json_path = out_dir / "report.json"
963
+ cleaned_bib_path: Path | None = None
964
 
965
+ try:
966
+ report_gen.save_html(str(html_path))
967
+ report_gen.save_bibliography_report(str(md_path))
968
+ report_gen.save_json(str(json_path))
969
+ if usage_checker:
970
+ used_keys = {er.entry.key for er in report_gen.entries if er.usage and er.usage.is_used}
971
+ if used_keys:
972
+ cleaned_bib_path = out_dir / f"{bib_path.stem}_only_used.bib"
973
+ bib_parser.filter_file(str(bib_path), str(cleaned_bib_path), used_keys)
974
+ except Exception:
975
+ logger.exception("Artifact generation failed")
976
 
977
+ # Embed report.html as iframe srcdoc
978
+ if html_path.exists():
979
+ iframe_html = _html_to_iframe(html_path.read_text(encoding='utf-8'))
980
+ else:
981
+ iframe_html = _placeholder("Report generation failed — see bibguard.log.")
982
+
983
+ meta = _meta_with_logs([
984
+ f"📚 {len(entries)} entries",
985
+ f"✓ {verified_count} verified",
986
+ f"⚠ {flagged_count} flagged",
987
+ _elapsed(),
988
+ ])
989
+ state = "done"
990
+ summary = "Report ready. Use the right pane to filter, search, and copy fixes."
991
+ if log_stats.errors > 0:
992
+ state = "error"
993
+ summary = (f"Done with {log_stats.errors} error(s) and {log_stats.warnings} warning(s) "
994
+ "logged — see <code>bibguard.log</code> for full tracebacks.")
995
+ elif log_stats.warnings > 0:
996
+ summary = (f"Report ready ({log_stats.warnings} warnings logged — see "
997
+ "<code>bibguard.log</code>).")
998
+
999
+ yield (
1000
+ iframe_html,
1001
+ _status_html("Done", summary, meta=meta, state=state),
1002
+ str(html_path) if html_path.exists() else None,
1003
+ str(md_path) if md_path.exists() else None,
1004
+ str(json_path) if json_path.exists() else None,
1005
+ str(cleaned_bib_path) if (cleaned_bib_path and cleaned_bib_path.exists()) else None,
1006
+ log_path_str,
1007
+ )
1008
+
1009
+
1010
+ # --------------------------------------------------------------------- layout
1011
 
1012
+ def create_app() -> gr.Blocks:
1013
+ # Inline app icon as a base64 data URL — works regardless of cwd.
1014
+ icon_html = '<span style="font-size:28px">🛡️</span>'
 
 
1015
  try:
1016
+ icon_path = Path(__file__).parent / "assets" / "icon-192.png"
1017
  if icon_path.exists():
1018
  with open(icon_path, "rb") as f:
1019
+ b64 = base64.b64encode(f.read()).decode()
1020
+ icon_html = (
1021
+ f'<img src="data:image/png;base64,{b64}" '
1022
+ f'style="width:32px;height:32px;border-radius:6px" alt="BibGuard">'
1023
+ )
1024
+ except Exception as e:
1025
+ logger.debug("Icon load failed; using emoji fallback: %s", e, exc_info=True)
1026
+
1027
+ with gr.Blocks(
1028
+ title="BibGuard Bibliography & LaTeX Quality Auditor",
1029
+ ) as app:
1030
+
1031
+ gr.HTML(f"""
1032
+ <div class="bg-header" style="display:flex;align-items:center;gap:10px">
1033
+ {icon_html}
1034
+ <strong style="font-size:18px">BibGuard</strong>
1035
+ <span style="color:#6b7280;font-size:13px">Bibliography & LaTeX quality auditor</span>
1036
+ <span style="flex:1"></span>
1037
+ <a href="https://github.com/thinkwee/BibGuard" target="_blank"
1038
+ style="color:#6b7280;text-decoration:none;font-size:13px">GitHub ↗</a>
1039
+ </div>
1040
+ """)
1041
+
1042
+ # ───────────────────────── Top toolbar ─────────────────────────
1043
+ # All primary controls on a single horizontal row, every primary
1044
+ # widget pinned to 56px height. gr.UploadButton replaces gr.File
1045
+ # because the latter's drop-zone doesn't shrink to a toolbar.
1046
+ with gr.Row(elem_classes=["bg-toolbar"]):
1047
+ with gr.Column(scale=2, min_width=200):
1048
+ bib_btn = gr.UploadButton(
1049
+ "📚 Choose .bib file",
1050
+ file_types=[".bib"], file_count="single",
1051
+ elem_classes=["bg-upload-btn"],
1052
+ )
1053
+ bib_status = gr.HTML('<div class="bg-fname">no file selected</div>')
1054
+ with gr.Column(scale=2, min_width=200):
1055
+ tex_btn = gr.UploadButton(
1056
+ "📄 Choose .tex file",
1057
+ file_types=[".tex"], file_count="single",
1058
+ elem_classes=["bg-upload-btn"],
1059
  )
1060
+ tex_status = gr.HTML('<div class="bg-fname">no file selected</div>')
1061
+ with gr.Column(scale=3, min_width=280):
1062
+ preset = gr.Radio(
1063
+ choices=list(PRESETS.keys()),
1064
+ value="Standard",
1065
+ show_label=False,
1066
+ elem_classes=["bg-preset"],
1067
  )
1068
+ preset_caption = gr.HTML(
1069
+ _preset_caption_html("Standard"),
1070
+ )
1071
+ with gr.Column(scale=1, min_width=140):
1072
+ run_btn = gr.Button("▶ Run check", variant="primary",
1073
+ elem_classes=["bg-run-btn"])
1074
+ stop_btn = gr.Button("◼ Stop", variant="stop",
1075
+ elem_classes=["bg-run-btn", "bg-stop-btn"],
1076
+ visible=False)
1077
+ gr.HTML('<div class="bg-fname" style="text-align:center">&nbsp;</div>')
1078
+
1079
+ # Holds the selected file paths (strings). Updated by the UploadButton
1080
+ # callbacks below so run_check sees plain paths regardless of how the
1081
+ # user picked the files.
1082
+ bib_path_state = gr.State(value=None)
1083
+ tex_path_state = gr.State(value=None)
1084
+
1085
+ # Advanced fine-grained toggles. Default closed — most users just
1086
+ # pick a preset and go. Each tab is composed of gr.Row blocks of
1087
+ # exactly 4 cells so columns line up vertically. Short rows are
1088
+ # padded with invisible spacer HTML.
1089
+ def _spacer():
1090
+ return gr.HTML('<div class="bg-row-spacer">&nbsp;</div>',
1091
+ elem_classes=["bg-row-spacer"])
1092
+
1093
+ with gr.Accordion("⚙️ Advanced settings", open=False):
1094
+ with gr.Tabs():
1095
+ with gr.TabItem("Bibliography"):
1096
+ with gr.Row(elem_classes=["bg-row"]):
1097
+ check_metadata = gr.Checkbox(label="Metadata verify", value=False)
1098
+ check_usage = gr.Checkbox(label="Usage", value=True)
1099
+ check_duplicates = gr.Checkbox(label="Duplicates", value=True)
1100
+ check_preprint_ratio = gr.Checkbox(label="Preprints", value=True)
1101
+ with gr.Row(elem_classes=["bg-row"]):
1102
+ retraction = gr.Checkbox(label="Retractions", value=True)
1103
+ url_liveness = gr.Checkbox(label="URL liveness", value=False)
1104
+ _spacer()
1105
+ _spacer()
1106
+
1107
+ with gr.TabItem("LaTeX format"):
1108
+ with gr.Row(elem_classes=["bg-row"]):
1109
+ caption = gr.Checkbox(label="Captions", value=True)
1110
+ reference = gr.Checkbox(label="References", value=True)
1111
+ formatting = gr.Checkbox(label="Formatting", value=True)
1112
+ equation = gr.Checkbox(label="Equations", value=True)
1113
+
1114
+ with gr.TabItem("Writing"):
1115
+ with gr.Row(elem_classes=["bg-row"]):
1116
+ ai_artifacts = gr.Checkbox(label="AI artifacts", value=True)
1117
+ sentence = gr.Checkbox(label="Sentences", value=True)
1118
+ consistency = gr.Checkbox(label="Consistency", value=True)
1119
+ acronym = gr.Checkbox(label="Acronyms", value=True)
1120
+ with gr.Row(elem_classes=["bg-row"]):
1121
+ number = gr.Checkbox(label="Numbers", value=True)
1122
+ citation_quality = gr.Checkbox(label="Citations", value=True)
1123
+ anonymization = gr.Checkbox(label="Anonymization", value=True)
1124
+ _spacer()
1125
+
1126
+ # ───────────────────────── Status strip ─────────────────────────
1127
+ status_panel = gr.HTML(value=EMPTY_STATUS_HTML, elem_id="bg-status-wrap")
1128
+
1129
+ # ───────────────────────── Report (full width) ───────────────────
1130
+ with gr.Row(elem_classes=["bg-main"]):
1131
+ report_panel = gr.HTML(value=EMPTY_PANEL_HTML)
1132
+
1133
+ # ───────────────────────── Downloads ────────────────────────────
1134
+ with gr.Accordion("📥 Downloads", open=False):
1135
+ with gr.Row(elem_classes=["bg-downloads"]):
1136
+ download_html = gr.File(label="report.html (offline)",
1137
+ interactive=False, elem_classes=["bg-file-input"])
1138
+ download_md = gr.File(label="bibliography_report.md",
1139
+ interactive=False, elem_classes=["bg-file-input"])
1140
+ download_json = gr.File(label="report.json",
1141
+ interactive=False, elem_classes=["bg-file-input"])
1142
+ download_bib = gr.File(label="cleaned .bib",
1143
+ interactive=False, elem_classes=["bg-file-input"])
1144
+ download_log = gr.File(label="bibguard.log",
1145
+ interactive=False, elem_classes=["bg-file-input"])
1146
+
1147
+ gr.HTML(
1148
+ '<div class="bg-footer">'
1149
+ 'Set <code>$BIBGUARD_CONTACT_EMAIL</code> for the polite-pool User-Agent · '
1150
+ f'persistent log at <code>{LOG_PATH}</code> · '
1151
+ 'set <code>BIBGUARD_DEBUG=1</code> for verbose console output.'
1152
+ '</div>'
1153
+ )
1154
+
1155
+ preset.change(
1156
+ fn=apply_preset,
1157
+ inputs=[preset],
1158
+ outputs=[
1159
+ check_metadata, check_usage, check_duplicates, check_preprint_ratio,
1160
+ caption, reference, formatting, equation,
1161
+ ai_artifacts, sentence, consistency, acronym,
1162
+ number, citation_quality, anonymization,
1163
+ url_liveness, retraction,
1164
+ ],
1165
+ )
1166
+ preset.change(
1167
+ fn=_preset_caption_html,
1168
+ inputs=[preset],
1169
+ outputs=[preset_caption],
1170
+ )
1171
+
1172
+ # ---- Upload-button callbacks: store path in state + update chip ----
1173
+
1174
+ def _on_bib_upload(f):
1175
+ if f is None:
1176
+ return None, '<div class="bg-fname">no file selected</div>'
1177
+ path = getattr(f, "name", str(f))
1178
+ return path, f'<div class="bg-fname ok">📚 {Path(path).name}</div>'
1179
+
1180
+ def _on_tex_upload(f):
1181
+ if f is None:
1182
+ return None, '<div class="bg-fname">no file selected</div>'
1183
+ path = getattr(f, "name", str(f))
1184
+ return path, f'<div class="bg-fname ok">📄 {Path(path).name}</div>'
1185
+
1186
+ bib_btn.upload(_on_bib_upload, inputs=[bib_btn], outputs=[bib_path_state, bib_status])
1187
+ tex_btn.upload(_on_tex_upload, inputs=[tex_btn], outputs=[tex_path_state, tex_status])
1188
+
1189
+ # Run pipeline:
1190
+ # 1. Toggle visibility: hide Run, show Stop.
1191
+ # 2. Stream run_check yields into report + status + downloads.
1192
+ # 3. After completion, swap buttons back.
1193
+ # Stop button cancels the streaming task via Gradio's `cancels=`.
1194
+ def _show_stop():
1195
+ return gr.update(visible=False), gr.update(visible=True)
1196
+
1197
+ def _show_run():
1198
+ return gr.update(visible=True), gr.update(visible=False)
1199
+
1200
+ run_event = run_btn.click(
1201
+ fn=_show_stop, inputs=None, outputs=[run_btn, stop_btn],
1202
+ ).then(
1203
  fn=run_check,
1204
  inputs=[
1205
+ bib_path_state, tex_path_state,
1206
  check_metadata, check_usage, check_duplicates, check_preprint_ratio,
1207
  caption, reference, formatting, equation, ai_artifacts,
1208
+ sentence, consistency, acronym, number, citation_quality, anonymization,
1209
+ url_liveness, retraction,
1210
  ],
1211
+ outputs=[report_panel, status_panel,
1212
+ download_html, download_md, download_json, download_bib, download_log],
1213
+ ).then(
1214
+ fn=_show_run, inputs=None, outputs=[run_btn, stop_btn],
1215
+ )
1216
+
1217
+ stop_btn.click(
1218
+ fn=lambda: (
1219
+ gr.update(visible=True),
1220
+ gr.update(visible=False),
1221
+ _status_html("Cancelled",
1222
+ "Run interrupted by user. Partial results discarded.",
1223
+ state="error"),
1224
+ ),
1225
+ inputs=None,
1226
+ outputs=[run_btn, stop_btn, status_panel],
1227
+ cancels=[run_event],
1228
  )
1229
+
1230
  return app
1231
 
1232
 
 
1233
  app = create_app()
1234
 
1235
+
1236
  if __name__ == "__main__":
1237
+ _favicon = Path(__file__).parent / "assets" / "icon-192.png"
1238
  app.launch(
1239
+ favicon_path=str(_favicon) if _favicon.exists() else None,
1240
  show_error=True,
1241
  css=CUSTOM_CSS,
1242
+ theme=gr.themes.Soft(),
1243
  )
app_helper.py CHANGED
@@ -1,98 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def fetch_and_compare_with_workflow(
2
- entry, workflow_steps, arxiv_fetcher, crossref_fetcher,
3
- semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
 
 
 
 
 
 
4
  ):
5
- """Fetch metadata from online sources using the configured workflow."""
6
- from src.utils.normalizer import TextNormalizer
7
-
8
- best_result = None
9
-
10
- # If no steps provided, use default order
11
- if not workflow_steps:
12
- # Create a default list of steps if needed, or simply handle logic here
13
- pass
14
-
15
- # Simplified workflow execution: Run through enabled steps
16
- # We manualy iterate through sources in a preferred order if workflow is not fully configured
17
- # Or iterate through the steps list.
18
-
19
- # Since extracting WorkflowConfig logic is complex, let's just implement a robust
20
- # default search strategy here which is what the user likely wants.
21
-
22
- results = []
23
-
24
- # 1. DBLP (High quality for CS)
25
- if dblp_fetcher and entry.title:
26
- try:
27
- dblp_result = dblp_fetcher.search_by_title(entry.title)
28
- if dblp_result:
29
- res = comparator.compare_with_dblp(entry, dblp_result)
30
- if res.is_match: return res
31
- results.append(res)
32
- except Exception: pass
33
-
34
- # 2. Semantic Scholar (Comprehensive)
35
- if semantic_scholar_fetcher and entry.title:
36
- try:
37
- ss_result = None
38
- if entry.doi:
39
- ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
40
- if not ss_result:
41
- ss_result = semantic_scholar_fetcher.search_by_title(entry.title)
42
-
43
- if ss_result:
44
- res = comparator.compare_with_semantic_scholar(entry, ss_result)
45
- if res.is_match: return res
46
- results.append(res)
47
- except Exception: pass
48
-
49
- # 3. OpenAlex
50
- if openalex_fetcher and entry.title:
51
- try:
52
- oa_result = None
53
- if entry.doi:
54
- oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
55
- if not oa_result:
56
- oa_result = openalex_fetcher.search_by_title(entry.title)
57
-
58
- if oa_result:
59
- res = comparator.compare_with_openalex(entry, oa_result)
60
- if res.is_match: return res
61
- results.append(res)
62
- except Exception: pass
63
-
64
- # 4. CrossRef (Official metadata)
65
- if crossref_fetcher and entry.doi:
66
- try:
67
- crossref_result = crossref_fetcher.search_by_doi(entry.doi)
68
- if crossref_result:
69
- res = comparator.compare_with_crossref(entry, crossref_result)
70
- if res.is_match: return res
71
- results.append(res)
72
- except Exception: pass
73
-
74
- # 5. ArXiv
75
- if arxiv_fetcher:
76
- try:
77
- arxiv_meta = None
78
- if entry.has_arxiv:
79
- arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
80
- elif entry.title:
81
- # Search by title
82
- search_results = arxiv_fetcher.search_by_title(entry.title, max_results=1)
83
- if search_results:
84
- arxiv_meta = search_results[0]
85
-
86
- if arxiv_meta:
87
- res = comparator.compare_with_arxiv(entry, arxiv_meta)
88
- if res.is_match: return res
89
- results.append(res)
90
- except Exception: pass
91
-
92
- # Return the best result (highest confidence) if no perfect match found
93
- if results:
94
- results.sort(key=lambda x: x.confidence, reverse=True)
95
- return results[0]
96
-
97
- # If absolutely nothing found, return None or an 'Unable' result
98
- return comparator.create_unable_result(entry, "No metadata found in any source")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Per-entry metadata verification: parallel multi-source lookup with corroboration.
3
+
4
+ Strategy (in order):
5
+ 1. **Identifier lookups, in parallel**:
6
+ - DOI → CrossRef, Semantic Scholar, OpenAlex
7
+ - arXiv ID → arXiv, Semantic Scholar
8
+ If the bib entry has either, this stage usually returns 2-3 independent
9
+ hits within a few hundred ms. Identifier lookups are far more reliable
10
+ than title search because the identifier is unique.
11
+
12
+ 2. **Title searches across sources, in parallel** (always run as corroboration,
13
+ even if identifiers were found): Semantic Scholar, OpenAlex, DBLP, CrossRef,
14
+ arXiv. Each source returns top-K candidates; we keep the candidate whose
15
+ title most closely matches the bib title.
16
+
17
+ 3. **Score & corroborate**:
18
+ - Pick the result with the highest per-source confidence.
19
+ - If ≥2 sources independently report the same title (sim ≥ 0.95) we
20
+ mark `is_match=True` even when individual confidences are middling
21
+ — multi-source agreement is the single strongest signal.
22
+ - Tightened thresholds: title sim ≥ 0.88 + year diff ≤ 1 (or year empty)
23
+ to declare a single-source match. Single-source matches that disagree
24
+ with corroborating sources are downgraded.
25
+
26
+ The function still returns a single ComparisonResult so the rest of the
27
+ pipeline doesn't change. Extra evidence (sources tried, agreement count) is
28
+ stuffed into the `issues` field as informational notes when relevant.
29
+ """
30
+ from __future__ import annotations
31
+
32
+ import concurrent.futures as cf
33
+ import logging
34
+ from typing import List, Optional, Tuple
35
+
36
+ from src.utils.normalizer import TextNormalizer
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # Year tolerance for "match" (preprint vs published often differ by 1y).
41
+ _YEAR_TOL = 1
42
+ # Title similarity required for single-source match.
43
+ _TITLE_MATCH_TIGHT = 0.88
44
+ # Title similarity required to count as "corroborating" another source.
45
+ _TITLE_AGREE = 0.95
46
+
47
+
48
+ def _title_sim(a: str, b: str) -> float:
49
+ if not a or not b:
50
+ return 0.0
51
+ a_n = TextNormalizer.normalize_for_comparison(a)
52
+ b_n = TextNormalizer.normalize_for_comparison(b)
53
+ if not a_n or not b_n:
54
+ return 0.0
55
+ jacc = TextNormalizer.similarity_ratio(a_n, b_n)
56
+ if max(len(a_n), len(b_n)) < 200:
57
+ lev = TextNormalizer.levenshtein_similarity(a_n, b_n)
58
+ return max(jacc, lev)
59
+ return jacc
60
+
61
+
62
+ def _year_close(y1: str, y2: str) -> bool:
63
+ """True if years are missing on either side or within ±1."""
64
+ y1, y2 = (y1 or "").strip(), (y2 or "").strip()
65
+ if not y1 or not y2:
66
+ return True
67
+ try:
68
+ return abs(int(y1[:4]) - int(y2[:4])) <= _YEAR_TOL
69
+ except ValueError:
70
+ return False
71
+
72
+
73
+ def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]:
74
+ """Pick the candidate whose title most closely matches `bib_title`."""
75
+ best, best_sim = None, 0.0
76
+ for c in candidates:
77
+ sim = _title_sim(bib_title, getattr(c, "title", "") or "")
78
+ if sim > best_sim:
79
+ best, best_sim = c, sim
80
+ return best, best_sim
81
+
82
+
83
  def fetch_and_compare_with_workflow(
84
+ entry,
85
+ workflow_steps, # accepted for API compat; ignored — strategy is fixed
86
+ arxiv_fetcher,
87
+ crossref_fetcher,
88
+ semantic_scholar_fetcher,
89
+ openalex_fetcher,
90
+ dblp_fetcher,
91
+ comparator,
92
  ):
93
+ """Look up `entry` across all available sources in parallel and return a single ComparisonResult."""
94
+ has_doi = bool(getattr(entry, "doi", "") or "")
95
+ has_arxiv = bool(getattr(entry, "has_arxiv", False))
96
+ has_title = bool(getattr(entry, "title", "") or "")
97
+
98
+ if not (has_doi or has_arxiv or has_title):
99
+ return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up")
100
+
101
+ # ------------------------------------------------------------------ stage 1
102
+ # Tasks are tuples of (source_name, callable returning ComparisonResult or None).
103
+ tasks: list[tuple[str, callable]] = []
104
+
105
+ # Identifier-based lookups (high precision).
106
+ if has_doi and crossref_fetcher:
107
+ def _t_cr_doi(e=entry):
108
+ r = crossref_fetcher.search_by_doi(e.doi)
109
+ return comparator.compare_with_crossref(e, r) if r else None
110
+ tasks.append(("crossref(doi)", _t_cr_doi))
111
+
112
+ if has_doi and semantic_scholar_fetcher:
113
+ def _t_s2_doi(e=entry):
114
+ r = semantic_scholar_fetcher.fetch_by_doi(e.doi)
115
+ return comparator.compare_with_semantic_scholar(e, r) if r else None
116
+ tasks.append(("s2(doi)", _t_s2_doi))
117
+
118
+ if has_doi and openalex_fetcher:
119
+ def _t_oa_doi(e=entry):
120
+ r = openalex_fetcher.fetch_by_doi(e.doi)
121
+ return comparator.compare_with_openalex(e, r) if r else None
122
+ tasks.append(("openalex(doi)", _t_oa_doi))
123
+
124
+ if has_arxiv and arxiv_fetcher:
125
+ def _t_arxiv_id(e=entry):
126
+ r = arxiv_fetcher.fetch_by_id(e.arxiv_id)
127
+ return comparator.compare_with_arxiv(e, r) if r else None
128
+ tasks.append(("arxiv(id)", _t_arxiv_id))
129
+
130
+ if has_arxiv and semantic_scholar_fetcher and not has_doi:
131
+ # If we already queried S2 by DOI we don't double-bill.
132
+ def _t_s2_arxiv(e=entry):
133
+ r = semantic_scholar_fetcher.fetch_by_arxiv_id(e.arxiv_id)
134
+ return comparator.compare_with_semantic_scholar(e, r) if r else None
135
+ tasks.append(("s2(arxiv)", _t_s2_arxiv))
136
+
137
+ # Title-based lookups (always run as corroboration if title available).
138
+ if has_title:
139
+ if semantic_scholar_fetcher and not has_doi and not has_arxiv:
140
+ def _t_s2_title(e=entry):
141
+ cands = semantic_scholar_fetcher.search_by_title_multi(e.title, max_results=5)
142
+ best, _ = _pick_best_candidate(e.title, cands)
143
+ return comparator.compare_with_semantic_scholar(e, best) if best else None
144
+ tasks.append(("s2(title)", _t_s2_title))
145
+
146
+ if openalex_fetcher and not has_doi:
147
+ def _t_oa_title(e=entry):
148
+ cands = openalex_fetcher.search_by_title_multi(e.title, max_results=5)
149
+ best, _ = _pick_best_candidate(e.title, cands)
150
+ return comparator.compare_with_openalex(e, best) if best else None
151
+ tasks.append(("openalex(title)", _t_oa_title))
152
+
153
+ if dblp_fetcher:
154
+ def _t_dblp_title(e=entry):
155
+ cands = dblp_fetcher.search_by_title_multi(e.title, max_results=5)
156
+ best, _ = _pick_best_candidate(e.title, cands)
157
+ return comparator.compare_with_dblp(e, best) if best else None
158
+ tasks.append(("dblp(title)", _t_dblp_title))
159
+
160
+ if crossref_fetcher and not has_doi:
161
+ def _t_cr_title(e=entry):
162
+ cands = crossref_fetcher.search_by_title_multi(e.title, max_results=5)
163
+ best, _ = _pick_best_candidate(e.title, cands)
164
+ return comparator.compare_with_crossref(e, best) if best else None
165
+ tasks.append(("crossref(title)", _t_cr_title))
166
+
167
+ if arxiv_fetcher and not has_arxiv:
168
+ def _t_arxiv_title(e=entry):
169
+ cands = arxiv_fetcher.search_by_title(e.title, max_results=5)
170
+ best, _ = _pick_best_candidate(e.title, cands)
171
+ return comparator.compare_with_arxiv(e, best) if best else None
172
+ tasks.append(("arxiv(title)", _t_arxiv_title))
173
+
174
+ if not tasks:
175
+ return comparator.create_unable_result(entry, "No fetchers configured")
176
+
177
+ # Run in parallel with EARLY EXIT.
178
+ #
179
+ # Strategy:
180
+ # - Submit every task to a pool.
181
+ # - Drain `as_completed` with a SHORT poll deadline.
182
+ # - Stop early as soon as we have one high-confidence match (≥0.85)
183
+ # plus at least one corroborating result whose title aligns.
184
+ # - Hard ceiling: 18s total wall-clock per entry. Whatever finished
185
+ # by then is what we use; the rest is cancelled so we don't pay
186
+ # the slowest-source penalty (a 80s-rate-limited S2 retry, e.g.).
187
+ results: list = []
188
+ sources_tried: list[str] = []
189
+ entry_key = getattr(entry, "key", "<unknown>")
190
+ deadline = __import__("time").monotonic() + 18.0
191
+ HIGH_CONF = 0.85
192
+
193
+ def _have_corroborated(rs: list) -> bool:
194
+ if not rs:
195
+ return False
196
+ rs_sorted = sorted(rs, key=lambda r: r.confidence, reverse=True)
197
+ primary = rs_sorted[0]
198
+ if primary.confidence < HIGH_CONF:
199
+ return False
200
+ for other in rs_sorted[1:]:
201
+ if other.fetched_title and _title_sim(primary.fetched_title,
202
+ other.fetched_title) >= _TITLE_AGREE:
203
+ return True
204
+ return False
205
+
206
+ pool = cf.ThreadPoolExecutor(max_workers=min(8, len(tasks)))
207
+ future_to_name = {pool.submit(fn): name for name, fn in tasks}
208
+ try:
209
+ pending = set(future_to_name)
210
+ while pending:
211
+ remaining = deadline - __import__("time").monotonic()
212
+ if remaining <= 0:
213
+ logger.debug("Entry=%s: 18s deadline reached, %d sources still pending",
214
+ entry_key, len(pending))
215
+ break
216
+ done, pending = cf.wait(pending, timeout=min(remaining, 2.0),
217
+ return_when=cf.FIRST_COMPLETED)
218
+ for fut in done:
219
+ name = future_to_name[fut]
220
+ sources_tried.append(name)
221
+ try:
222
+ r = fut.result(timeout=0)
223
+ except Exception as e:
224
+ logger.warning(
225
+ "Lookup failed for entry=%s source=%s: %s",
226
+ entry_key, name, e, exc_info=True,
227
+ )
228
+ continue
229
+ if r is not None:
230
+ results.append(r)
231
+ if _have_corroborated(results):
232
+ logger.debug("Entry=%s: corroborated early after %d sources", entry_key, len(results))
233
+ break
234
+ finally:
235
+ # Cancel anything still in the queue; threads already running can't
236
+ # be killed, but they'll finish quietly without blocking us.
237
+ for fut in future_to_name:
238
+ if not fut.done():
239
+ fut.cancel()
240
+ pool.shutdown(wait=False, cancel_futures=True)
241
+
242
+ if not results:
243
+ return comparator.create_unable_result(
244
+ entry,
245
+ f"Tried {len(tasks)} sources ({', '.join(sources_tried) or 'none'}) — no metadata returned"
246
+ )
247
+
248
+ # ------------------------------------------------------------------ stage 2: pick + corroborate
249
+ # Sort by confidence; pick top.
250
+ results.sort(key=lambda r: r.confidence, reverse=True)
251
+ primary = results[0]
252
+
253
+ # Count corroborating sources that report a title within sim ≥ _TITLE_AGREE
254
+ # of the primary's fetched_title.
255
+ primary_title = primary.fetched_title
256
+ agree_count = 0
257
+ distinct_sources = set()
258
+ for r in results:
259
+ if r is primary:
260
+ continue
261
+ if not r.fetched_title:
262
+ continue
263
+ if _title_sim(primary_title, r.fetched_title) >= _TITLE_AGREE:
264
+ agree_count += 1
265
+ distinct_sources.add(r.source)
266
+
267
+ # ------------------------------------------------------------------ stage 3: refine match decision
268
+ # Tighten / loosen `is_match` based on corroboration + year tolerance.
269
+ title_ok_tight = primary.title_similarity >= _TITLE_MATCH_TIGHT
270
+ year_ok_loose = _year_close(primary.bib_year, primary.fetched_year)
271
+
272
+ if agree_count >= 1 and title_ok_tight:
273
+ primary.is_match = True
274
+ elif title_ok_tight and primary.author_match and year_ok_loose:
275
+ primary.is_match = True
276
+ elif primary.is_match and not (title_ok_tight and year_ok_loose):
277
+ # Original heuristic said match but our stricter rule disagrees.
278
+ primary.is_match = False
279
+ if not any("stricter check" in i.lower() for i in primary.issues):
280
+ primary.issues.append(
281
+ "Marked unverified by stricter check (title/year tolerance not met)."
282
+ )
283
+
284
+ # Boost / annotate confidence with corroboration signal.
285
+ if agree_count >= 1:
286
+ # Each corroborating source bumps confidence toward 1.0.
287
+ bonus = min(0.25, 0.1 + 0.05 * agree_count)
288
+ primary.confidence = min(1.0, primary.confidence + bonus)
289
+ # Positive note — goes to `notes`, NOT `issues`. Otherwise verified
290
+ # entries would display a misleading "1 issue(s)" badge.
291
+ primary.notes.append(
292
+ f"Corroborated by {agree_count} other source(s): {', '.join(sorted(distinct_sources))}."
293
+ )
294
+
295
+ # Year-only mismatch with otherwise solid match: drop the hard issue
296
+ # and record a soft note instead (preprint/published year difference).
297
+ if (primary.title_match and primary.author_match and not primary.year_match
298
+ and year_ok_loose and primary.bib_year and primary.fetched_year):
299
+ primary.issues = [
300
+ i for i in primary.issues if not i.startswith("Year mismatch")
301
+ ]
302
+ primary.notes.append(
303
+ f"Year differs by ≤1 ({primary.bib_year} vs {primary.fetched_year}) — "
304
+ "likely preprint/published difference, treated as match."
305
+ )
306
+
307
+ return primary
bibguard.yaml CHANGED
@@ -27,6 +27,23 @@ files:
27
  output_dir: "test"
28
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # ==============================================================================
31
  # 🎓 Conference Template
32
  # ==============================================================================
@@ -59,7 +76,7 @@ bibliography:
59
 
60
  # Relevance Assessment - Use LLM to evaluate if citations match their context
61
  # Requires LLM configuration (see llm section below). Disabled by default due to API costs.
62
- check_relevance: false
63
 
64
  # ==============================================================================
65
  # 📋 Submission Quality Checks
@@ -125,6 +142,21 @@ submission:
125
  # Detects GitHub links, acknowledgments, self-citations that may reveal author identity
126
  anonymization: true
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  # ==============================================================================
129
  # 🔍 Metadata Check Workflow
130
  # ==============================================================================
@@ -133,7 +165,7 @@ submission:
133
  # Set enabled: false to skip a particular source.
134
  workflow:
135
  - name: arxiv_id
136
- enabled: true
137
  description: "Lookup by arXiv ID (fastest, most reliable for preprints)"
138
 
139
  - name: crossref_doi
@@ -153,7 +185,7 @@ workflow:
153
  description: "OpenAlex API (broad coverage across disciplines)"
154
 
155
  - name: arxiv_title
156
- enabled: true
157
  description: "Search arXiv by title (fallback when ID unavailable)"
158
 
159
  - name: crossref_title
@@ -171,17 +203,18 @@ llm:
171
  # Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek
172
  # Each backend requires different setup (API keys, local installation, etc.)
173
  backend: "gemini"
174
-
175
  # Model name (leave empty to use backend default)
176
- # Examples: "gpt-4", "claude-3-opus", "gemini-pro", "llama3"
177
  model: ""
178
 
179
  # API endpoint (leave empty to use backend default)
180
  # Only needed for self-hosted models (vllm, ollama) or custom endpoints
181
  endpoint: ""
182
 
183
- # API key (recommended to use environment variables instead)
184
- # Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. in your environment
 
185
  api_key: ""
186
 
187
  # ==============================================================================
 
27
  output_dir: "test"
28
 
29
 
30
+ # ==============================================================================
31
+ # 🌐 Network / Politeness
32
+ # ==============================================================================
33
+ network:
34
+ # Real email used in User-Agent for arXiv/CrossRef/OpenAlex polite-pool requests.
35
+ # arXiv's robots policy asks for a real contact. Strongly recommended to fill in.
36
+ contact_email: ""
37
+
38
+ # Cache HTTP responses to a local SQLite DB. Same `entry.key` won't re-hit network
39
+ # within the TTL window. Hugely speeds up re-runs.
40
+ cache_enabled: true
41
+ cache_ttl_hours: 24
42
+
43
+ # Auto-retry on 429/5xx with exponential backoff.
44
+ retry_total: 5
45
+ retry_backoff_factor: 1.5
46
+
47
  # ==============================================================================
48
  # 🎓 Conference Template
49
  # ==============================================================================
 
76
 
77
  # Relevance Assessment - Use LLM to evaluate if citations match their context
78
  # Requires LLM configuration (see llm section below). Disabled by default due to API costs.
79
+ check_relevance: true
80
 
81
  # ==============================================================================
82
  # 📋 Submission Quality Checks
 
142
  # Detects GitHub links, acknowledgments, self-citations that may reveal author identity
143
  anonymization: true
144
 
145
+ # ==============================================================================
146
+ # 🌐 Network-Bound Bibliography Checks
147
+ # ==============================================================================
148
+ # These run only when explicitly enabled. Both operate solely on bib entries
149
+ # that carry the relevant field (no DOI ⇒ retraction skipped, no url= ⇒
150
+ # liveness skipped). The web UI's "Strict" preset turns both on.
151
+ submission_extra:
152
+ # URL Liveness - HEAD-then-GET every entry.url to find dead links.
153
+ # Slow on large bibs (one HTTP roundtrip per URL); off by default.
154
+ url_liveness: false
155
+
156
+ # Retractions - Look up every entry.doi against CrossRef's update-to relation
157
+ # to flag retracted, withdrawn, or "expression of concern" papers.
158
+ retraction: true
159
+
160
  # ==============================================================================
161
  # 🔍 Metadata Check Workflow
162
  # ==============================================================================
 
165
  # Set enabled: false to skip a particular source.
166
  workflow:
167
  - name: arxiv_id
168
+ enabled: false
169
  description: "Lookup by arXiv ID (fastest, most reliable for preprints)"
170
 
171
  - name: crossref_doi
 
185
  description: "OpenAlex API (broad coverage across disciplines)"
186
 
187
  - name: arxiv_title
188
+ enabled: false
189
  description: "Search arXiv by title (fallback when ID unavailable)"
190
 
191
  - name: crossref_title
 
203
  # Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek
204
  # Each backend requires different setup (API keys, local installation, etc.)
205
  backend: "gemini"
206
+
207
  # Model name (leave empty to use backend default)
208
+ # Examples: "gpt-4o-mini", "claude-haiku-4-5-20251001", "gemini-2.5-flash", "llama3"
209
  model: ""
210
 
211
  # API endpoint (leave empty to use backend default)
212
  # Only needed for self-hosted models (vllm, ollama) or custom endpoints
213
  endpoint: ""
214
 
215
+ # API key (RECOMMENDED: leave empty and use environment variables instead)
216
+ # Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, DEEPSEEK_API_KEY, etc.
217
+ # in your shell. BibGuard will read from $<BACKEND>_API_KEY automatically.
218
  api_key: ""
219
 
220
  # ==============================================================================
main.py CHANGED
@@ -7,8 +7,12 @@ Usage:
7
  python main.py --config my.yaml # Use specified config file
8
  python main.py --init # Create default config file
9
  python main.py --list-templates # List available templates
 
 
 
10
  """
11
  import argparse
 
12
  import sys
13
  from pathlib import Path
14
  from typing import Optional, List
@@ -19,10 +23,17 @@ from src.analyzers import MetadataComparator, UsageChecker, LLMEvaluator, Duplic
19
  from src.analyzers.llm_evaluator import LLMBackend
20
  from src.report.generator import ReportGenerator, EntryReport
21
  from src.utils.progress import ProgressDisplay
 
 
 
22
  from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config
23
  from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
24
  from src.templates.base_template import get_template, get_all_templates
25
  from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity
 
 
 
 
26
 
27
 
28
  def main():
@@ -52,8 +63,24 @@ Usage Examples:
52
  action="store_true",
53
  help="List all available conference templates"
54
  )
55
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  args = parser.parse_args()
 
57
 
58
  # Handle --init
59
  if args.init:
@@ -95,25 +122,43 @@ Usage Examples:
95
  print(f"Error: Failed to parse config file: {e}")
96
  sys.exit(1)
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  # Validate required fields
99
  mode_dir = bool(config.files.input_dir)
100
-
101
  if mode_dir:
102
  input_dir = config.input_dir_path
103
  if not input_dir.exists() or not input_dir.is_dir():
104
  print(f"Error: Input directory does not exist or is not a directory: {input_dir}")
105
  sys.exit(1)
106
-
107
  tex_files = list(input_dir.rglob("*.tex"))
108
  bib_files = list(input_dir.rglob("*.bib"))
109
-
110
  if not tex_files:
111
  print(f"Error: No .tex files found in {input_dir}")
112
  sys.exit(1)
113
  if not bib_files:
114
  print(f"Error: No .bib files found in {input_dir}")
115
  sys.exit(1)
116
-
117
  config._tex_files = tex_files
118
  config._bib_files = bib_files
119
  else:
@@ -123,7 +168,7 @@ Usage Examples:
123
  if not config.files.tex:
124
  print("Error: tex file path not specified in config")
125
  sys.exit(1)
126
-
127
  # Validate files exist
128
  if not config.bib_path.exists():
129
  print(f"Error: Bib file does not exist: {config.bib_path}")
@@ -131,10 +176,29 @@ Usage Examples:
131
  if not config.tex_path.exists():
132
  print(f"Error: TeX file does not exist: {config.tex_path}")
133
  sys.exit(1)
134
-
135
  config._tex_files = [config.tex_path]
136
  config._bib_files = [config.bib_path]
137
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  # Load template if specified
139
  template = None
140
  if config.template:
@@ -143,12 +207,12 @@ Usage Examples:
143
  print(f"Error: Unknown template: {config.template}")
144
  print("Use --list-templates to see available templates")
145
  sys.exit(1)
146
-
147
  # Run the checker
148
  try:
149
  run_checker(config, template)
150
  except KeyboardInterrupt:
151
- print("\n\nCancelled")
152
  sys.exit(130)
153
  except Exception as e:
154
  print(f"\nError: {e}")
@@ -250,32 +314,62 @@ def run_checker(config: BibGuardConfig, template=None):
250
  [str(f) for f in config._tex_files]
251
  )
252
 
 
 
 
 
 
 
 
253
  # Run submission quality checks
254
  submission_results = []
255
- enabled_checkers = config.submission.get_enabled_checkers()
256
-
 
 
257
  for checker_name in enabled_checkers:
258
  if checker_name in CHECKER_REGISTRY:
259
  checker = CHECKER_REGISTRY[checker_name]()
260
  for tex_path_str, content in tex_contents.items():
261
- results = checker.check(content, {})
262
- # Tag results with file path
263
- for r in results:
264
- r.file_path = tex_path_str
265
  submission_results.extend(results)
266
-
267
  # Set results in report generator for summary calculation
268
  report_gen.set_submission_results(submission_results, template)
269
-
270
  # Check for duplicates (silent)
271
  if bib_config.check_duplicates and duplicate_detector:
272
  duplicate_groups = duplicate_detector.find_duplicates(entries)
273
  report_gen.set_duplicate_groups(duplicate_groups)
274
-
275
  # Check missing citations (silent)
276
  if bib_config.check_usage and usage_checker:
277
  missing = usage_checker.get_missing_entries(entries)
278
  report_gen.set_missing_citations(missing)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
  # Process entries
281
 
@@ -347,41 +441,46 @@ def run_checker(config: BibGuardConfig, template=None):
347
  # Determine number of workers (max 10 to avoid overwhelming APIs)
348
  max_workers = min(10, len(entries))
349
 
 
350
  with progress.progress_context(len(entries), "Processing bibliography") as prog:
351
  # Use ThreadPoolExecutor for parallel processing
352
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
353
  # Submit all tasks
354
  future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries}
355
-
356
  # Process completed tasks
357
- for future in as_completed(future_to_entry):
358
- entry = future_to_entry[future]
359
- try:
360
- entry_report, comparison_result = future.result()
361
-
362
- # Thread-safe progress update
363
- with progress_lock:
364
- report_gen.add_entry_report(entry_report)
365
-
366
- # Update progress
367
- if comparison_result and comparison_result.is_match:
368
- prog.mark_success()
369
- elif comparison_result and comparison_result.has_issues:
370
- prog.mark_warning()
371
- else:
 
 
 
 
 
 
 
 
372
  prog.mark_error()
373
-
374
- completed_count[0] += 1
375
- prog.update(entry.key, "Done", 1)
376
-
377
- except Exception as e:
378
- with progress_lock:
379
- prog.mark_error()
380
- progress.print_error(f"Error processing {entry.key}: {e}")
381
- completed_count[0] += 1
382
- prog.update(entry.key, "Failed", 1)
383
-
384
- # Summary will be printed at the very end
385
 
386
  # Generate reports and organize outputs (silent)
387
 
@@ -395,61 +494,55 @@ def run_checker(config: BibGuardConfig, template=None):
395
  shutil.copy2(bib_path, output_dir / bib_path.name)
396
  for tex_path in config._tex_files:
397
  shutil.copy2(tex_path, output_dir / tex_path.name)
398
- # 1. Bibliography Report
399
- bib_report_path = output_dir / "bibliography_report.md"
400
- report_gen.save_bibliography_report(str(bib_report_path))
401
-
402
- # 2. LaTeX Quality Report
403
- if submission_results:
404
- latex_report_path = output_dir / "latex_quality_report.md"
405
- report_gen.save_latex_quality_report(
406
- str(latex_report_path),
407
- submission_results,
408
- template
409
- )
410
-
411
- # 3. Line-by-Line Report
412
- from src.report.line_report import generate_line_report
413
- line_report_path = output_dir / "line_by_line_report.md"
414
-
415
- # For multiple files, we generate one big report with sections
416
- all_line_reports = []
417
- for tex_path_str, content in tex_contents.items():
418
- file_results = [r for r in submission_results if r.file_path == tex_path_str]
419
- if not file_results:
420
- continue
421
-
422
- from src.report.line_report import LineByLineReportGenerator
423
- gen = LineByLineReportGenerator(content, tex_path_str)
424
- gen.add_results(file_results)
425
- all_line_reports.append(gen.generate())
426
-
427
- if all_line_reports:
428
- with open(line_report_path, 'w', encoding='utf-8') as f:
429
- f.write("\n\n".join(all_line_reports))
430
-
431
- # 4. Clean bib file (if generated earlier)
432
  if bib_config.check_usage and usage_checker:
433
  used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used]
434
  if used_entries:
435
  try:
436
  keys_to_keep = {entry.key for entry in used_entries}
437
- # If multiple bibs, we merge them into one cleaned file
438
- # or just use the first one if it's single mode.
439
- # For now, let's just use a default name if multiple.
440
  if len(config._bib_files) == 1:
441
  clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib"
442
  bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep)
443
  else:
444
  clean_bib_path = output_dir / "merged_only_used.bib"
445
- # We need a way to filter multiple files into one.
446
- # BibParser.filter_file currently takes one input.
447
- # Let's just write all used entries to a new file.
448
  with open(clean_bib_path, 'w', encoding='utf-8') as f:
449
  for entry in used_entries:
450
- f.write(entry.raw + "\n\n")
451
  except Exception as e:
452
- pass
 
 
 
453
 
454
  # Print beautiful console summary
455
  if not config.output.quiet:
@@ -461,85 +554,40 @@ def fetch_and_compare_with_workflow(
461
  entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher,
462
  semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
463
  ):
464
- """Fetch metadata from online sources using the configured workflow."""
465
- from src.utils.normalizer import TextNormalizer
466
-
467
- all_results = []
468
- enabled_steps = workflow_config.get_enabled_steps()
469
-
470
- for step in enabled_steps:
471
- result = None
472
-
473
- if step.name == "arxiv_id" and entry.has_arxiv and arxiv_fetcher:
474
- arxiv_meta = arxiv_fetcher.fetch_by_id(entry.arxiv_id)
475
- if arxiv_meta:
476
- result = comparator.compare_with_arxiv(entry, arxiv_meta)
477
-
478
- elif step.name == "crossref_doi" and entry.doi and crossref_fetcher:
479
- crossref_result = crossref_fetcher.search_by_doi(entry.doi)
480
- if crossref_result:
481
- result = comparator.compare_with_crossref(entry, crossref_result)
482
-
483
- elif step.name == "semantic_scholar" and entry.title and semantic_scholar_fetcher:
484
- ss_result = None
485
- if entry.doi:
486
- ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
487
- if not ss_result:
488
- ss_result = semantic_scholar_fetcher.search_by_title(entry.title)
489
- if ss_result:
490
- result = comparator.compare_with_semantic_scholar(entry, ss_result)
491
-
492
- elif step.name == "dblp" and entry.title and dblp_fetcher:
493
- dblp_result = dblp_fetcher.search_by_title(entry.title)
494
- if dblp_result:
495
- result = comparator.compare_with_dblp(entry, dblp_result)
496
-
497
- elif step.name == "openalex" and entry.title and openalex_fetcher:
498
- oa_result = None
499
- if entry.doi:
500
- oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
501
- if not oa_result:
502
- oa_result = openalex_fetcher.search_by_title(entry.title)
503
- if oa_result:
504
- result = comparator.compare_with_openalex(entry, oa_result)
505
-
506
- elif step.name == "arxiv_title" and entry.title and arxiv_fetcher:
507
- results = arxiv_fetcher.search_by_title(entry.title, max_results=3)
508
- if results:
509
- best_result = None
510
- best_sim = 0.0
511
- norm1 = TextNormalizer.normalize_for_comparison(entry.title)
512
-
513
- for r in results:
514
- norm2 = TextNormalizer.normalize_for_comparison(r.title)
515
- sim = TextNormalizer.similarity_ratio(norm1, norm2)
516
- if sim > best_sim:
517
- best_sim = sim
518
- best_result = r
519
-
520
- if best_result and best_sim > 0.5:
521
- result = comparator.compare_with_arxiv(entry, best_result)
522
-
523
- elif step.name == "crossref_title" and entry.title and crossref_fetcher:
524
- crossref_result = crossref_fetcher.search_by_title(entry.title)
525
- if crossref_result:
526
- result = comparator.compare_with_crossref(entry, crossref_result)
527
-
528
- elif step.name == "google_scholar" and entry.title and scholar_fetcher:
529
  scholar_result = scholar_fetcher.search_by_title(entry.title)
530
  if scholar_result:
531
- result = comparator.compare_with_scholar(entry, scholar_result)
532
-
533
- if result:
534
- all_results.append(result)
535
- if result.is_match:
536
- return result
537
-
538
- if all_results:
539
- all_results.sort(key=lambda r: r.confidence, reverse=True)
540
- return all_results[0]
541
-
542
- return comparator.create_unable_result(entry, "Unable to find this paper in any data source")
543
 
544
 
545
  def get_abstract(entry, comparison_result, arxiv_fetcher):
 
7
  python main.py --config my.yaml # Use specified config file
8
  python main.py --init # Create default config file
9
  python main.py --list-templates # List available templates
10
+ python main.py --quick # Skip network-bound metadata/relevance/url checks
11
+ python main.py --format json,html,markdown
12
+ python main.py --verbose # DEBUG-level logs to stderr
13
  """
14
  import argparse
15
+ import logging
16
  import sys
17
  from pathlib import Path
18
  from typing import Optional, List
 
23
  from src.analyzers.llm_evaluator import LLMBackend
24
  from src.report.generator import ReportGenerator, EntryReport
25
  from src.utils.progress import ProgressDisplay
26
+ from src.utils.logging_setup import setup as setup_logging
27
+ from src.utils import http as http_layer
28
+ from src.utils.validation import validate_bib, validate_tex, format_report
29
  from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config
30
  from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
31
  from src.templates.base_template import get_template, get_all_templates
32
  from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity
33
+ from src.checkers.retraction_checker import RetractionChecker
34
+ from src.checkers.url_checker import URLChecker
35
+
36
+ logger = logging.getLogger("bibguard")
37
 
38
 
39
  def main():
 
63
  action="store_true",
64
  help="List all available conference templates"
65
  )
66
+ parser.add_argument(
67
+ "--quick",
68
+ action="store_true",
69
+ help="Skip network-bound checks (metadata, retraction, URL liveness, LLM)",
70
+ )
71
+ parser.add_argument(
72
+ "--format",
73
+ default=None,
74
+ help="Comma-separated list of output formats (markdown, html, json). Defaults to config.",
75
+ )
76
+ parser.add_argument(
77
+ "--verbose", "-v",
78
+ action="store_true",
79
+ help="Verbose (DEBUG) logging to stderr",
80
+ )
81
+
82
  args = parser.parse_args()
83
+ setup_logging("DEBUG" if args.verbose else None)
84
 
85
  # Handle --init
86
  if args.init:
 
122
  print(f"Error: Failed to parse config file: {e}")
123
  sys.exit(1)
124
 
125
+ # CLI overrides
126
+ if args.quick:
127
+ config.bibliography.check_metadata = False
128
+ config.bibliography.check_relevance = False
129
+ config.submission_extra.url_liveness = False
130
+ config.submission_extra.retraction = False
131
+ if args.format:
132
+ config.output.formats = [s.strip() for s in args.format.split(",") if s.strip()]
133
+
134
+ # Configure shared HTTP layer (retry + cache + UA)
135
+ http_layer.configure(
136
+ contact_email=config.network.contact_email,
137
+ cache_enabled=config.network.cache_enabled,
138
+ cache_ttl_hours=config.network.cache_ttl_hours,
139
+ retry_total=config.network.retry_total,
140
+ retry_backoff_factor=config.network.retry_backoff_factor,
141
+ )
142
+
143
  # Validate required fields
144
  mode_dir = bool(config.files.input_dir)
145
+
146
  if mode_dir:
147
  input_dir = config.input_dir_path
148
  if not input_dir.exists() or not input_dir.is_dir():
149
  print(f"Error: Input directory does not exist or is not a directory: {input_dir}")
150
  sys.exit(1)
151
+
152
  tex_files = list(input_dir.rglob("*.tex"))
153
  bib_files = list(input_dir.rglob("*.bib"))
154
+
155
  if not tex_files:
156
  print(f"Error: No .tex files found in {input_dir}")
157
  sys.exit(1)
158
  if not bib_files:
159
  print(f"Error: No .bib files found in {input_dir}")
160
  sys.exit(1)
161
+
162
  config._tex_files = tex_files
163
  config._bib_files = bib_files
164
  else:
 
168
  if not config.files.tex:
169
  print("Error: tex file path not specified in config")
170
  sys.exit(1)
171
+
172
  # Validate files exist
173
  if not config.bib_path.exists():
174
  print(f"Error: Bib file does not exist: {config.bib_path}")
 
176
  if not config.tex_path.exists():
177
  print(f"Error: TeX file does not exist: {config.tex_path}")
178
  sys.exit(1)
179
+
180
  config._tex_files = [config.tex_path]
181
  config._bib_files = [config.bib_path]
182
+
183
+ # Pre-flight content validation (R6)
184
+ any_fatal = False
185
+ for bp in config._bib_files:
186
+ rep = validate_bib(bp)
187
+ msg = format_report(rep, label=bp.name)
188
+ if msg:
189
+ print(msg)
190
+ if not rep.ok:
191
+ any_fatal = True
192
+ for tp in config._tex_files:
193
+ rep = validate_tex(tp)
194
+ msg = format_report(rep, label=tp.name)
195
+ if msg:
196
+ print(msg)
197
+ if not rep.ok:
198
+ any_fatal = True
199
+ if any_fatal:
200
+ sys.exit(1)
201
+
202
  # Load template if specified
203
  template = None
204
  if config.template:
 
207
  print(f"Error: Unknown template: {config.template}")
208
  print("Use --list-templates to see available templates")
209
  sys.exit(1)
210
+
211
  # Run the checker
212
  try:
213
  run_checker(config, template)
214
  except KeyboardInterrupt:
215
+ print("\n\n[BibGuard] Interrupted. Partial reports (if any) are in the output dir.")
216
  sys.exit(130)
217
  except Exception as e:
218
  print(f"\nError: {e}")
 
314
  [str(f) for f in config._tex_files]
315
  )
316
 
317
+ # Build the per-checker config dict (glossary, template, etc.)
318
+ checker_config = {
319
+ "glossary_preferred": config.glossary.preferred,
320
+ "glossary_acronyms": config.glossary.acronyms,
321
+ "template": template,
322
+ }
323
+
324
  # Run submission quality checks
325
  submission_results = []
326
+ enabled_checkers = list(config.submission.get_enabled_checkers())
327
+ if template is not None and "template" not in enabled_checkers:
328
+ enabled_checkers.append("template")
329
+
330
  for checker_name in enabled_checkers:
331
  if checker_name in CHECKER_REGISTRY:
332
  checker = CHECKER_REGISTRY[checker_name]()
333
  for tex_path_str, content in tex_contents.items():
334
+ # Run the checker on this file. We deliberately do NOT tag
335
+ # `r.file_path = tex_path_str` because user-facing reports
336
+ # never expose local tex paths (basename or full).
337
+ results = checker.check(content, checker_config)
338
  submission_results.extend(results)
339
+
340
  # Set results in report generator for summary calculation
341
  report_gen.set_submission_results(submission_results, template)
342
+
343
  # Check for duplicates (silent)
344
  if bib_config.check_duplicates and duplicate_detector:
345
  duplicate_groups = duplicate_detector.find_duplicates(entries)
346
  report_gen.set_duplicate_groups(duplicate_groups)
347
+
348
  # Check missing citations (silent)
349
  if bib_config.check_usage and usage_checker:
350
  missing = usage_checker.get_missing_entries(entries)
351
  report_gen.set_missing_citations(missing)
352
+
353
+ # Retraction lookups (F1)
354
+ if config.submission_extra.retraction:
355
+ try:
356
+ findings = RetractionChecker().check_entries(entries)
357
+ report_gen.set_retraction_findings(findings)
358
+ if findings:
359
+ logger.info("Retraction check found %d flagged entries", len(findings))
360
+ except Exception as e:
361
+ logger.debug("Retraction check failed: %s", e)
362
+
363
+ # URL liveness (F2)
364
+ if config.submission_extra.url_liveness:
365
+ try:
366
+ url_findings = URLChecker().check_entries(entries)
367
+ report_gen.set_url_findings(url_findings)
368
+ broken = sum(1 for f in url_findings if f.status != "ok")
369
+ if broken:
370
+ logger.info("URL liveness check: %d broken URL(s)", broken)
371
+ except Exception as e:
372
+ logger.debug("URL liveness check failed: %s", e)
373
 
374
  # Process entries
375
 
 
441
  # Determine number of workers (max 10 to avoid overwhelming APIs)
442
  max_workers = min(10, len(entries))
443
 
444
+ interrupted = False
445
  with progress.progress_context(len(entries), "Processing bibliography") as prog:
446
  # Use ThreadPoolExecutor for parallel processing
447
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
448
  # Submit all tasks
449
  future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries}
450
+
451
  # Process completed tasks
452
+ try:
453
+ for future in as_completed(future_to_entry):
454
+ entry = future_to_entry[future]
455
+ try:
456
+ entry_report, comparison_result = future.result()
457
+
458
+ # Thread-safe progress update
459
+ with progress_lock:
460
+ report_gen.add_entry_report(entry_report)
461
+
462
+ # Update progress
463
+ if comparison_result and comparison_result.is_match:
464
+ prog.mark_success()
465
+ elif comparison_result and comparison_result.has_issues:
466
+ prog.mark_warning()
467
+ else:
468
+ prog.mark_error()
469
+
470
+ completed_count[0] += 1
471
+ prog.update(entry.key, "Done", 1)
472
+
473
+ except Exception as e:
474
+ with progress_lock:
475
  prog.mark_error()
476
+ progress.print_error(f"Error processing {entry.key}: {e}")
477
+ completed_count[0] += 1
478
+ prog.update(entry.key, "Failed", 1)
479
+ except KeyboardInterrupt:
480
+ interrupted = True
481
+ logger.warning("Interrupted by user; cancelling remaining work and saving partial reports")
482
+ for f in future_to_entry:
483
+ f.cancel()
 
 
 
 
484
 
485
  # Generate reports and organize outputs (silent)
486
 
 
494
  shutil.copy2(bib_path, output_dir / bib_path.name)
495
  for tex_path in config._tex_files:
496
  shutil.copy2(tex_path, output_dir / tex_path.name)
497
+ requested_formats = {f.lower() for f in (config.output.formats or ["markdown", "html"])}
498
+
499
+ # 1. Bibliography Report (markdown)
500
+ if "markdown" in requested_formats:
501
+ bib_report_path = output_dir / "bibliography_report.md"
502
+ report_gen.save_bibliography_report(str(bib_report_path))
503
+
504
+ # 2. LaTeX Quality Report (markdown)
505
+ if submission_results:
506
+ latex_report_path = output_dir / "latex_quality_report.md"
507
+ report_gen.save_latex_quality_report(
508
+ str(latex_report_path),
509
+ submission_results,
510
+ template,
511
+ )
512
+
513
+ # 4. Self-contained HTML (★)
514
+ if "html" in requested_formats:
515
+ try:
516
+ report_gen.save_html(str(output_dir / "report.html"))
517
+ except Exception as e:
518
+ logger.warning("Failed to write HTML report: %s", e)
519
+
520
+ # 5. JSON output
521
+ if "json" in requested_formats:
522
+ try:
523
+ report_gen.save_json(str(output_dir / "report.json"))
524
+ except Exception as e:
525
+ logger.warning("Failed to write JSON report: %s", e)
526
+
527
+ # 6. Clean bib file (if generated earlier)
 
 
 
528
  if bib_config.check_usage and usage_checker:
529
  used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used]
530
  if used_entries:
531
  try:
532
  keys_to_keep = {entry.key for entry in used_entries}
 
 
 
533
  if len(config._bib_files) == 1:
534
  clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib"
535
  bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep)
536
  else:
537
  clean_bib_path = output_dir / "merged_only_used.bib"
 
 
 
538
  with open(clean_bib_path, 'w', encoding='utf-8') as f:
539
  for entry in used_entries:
540
+ f.write(getattr(entry, "raw", "") + "\n\n")
541
  except Exception as e:
542
+ logger.debug("Failed to write cleaned bib file: %s", e)
543
+
544
+ if interrupted:
545
+ print("[BibGuard] Saved partial reports for completed entries.")
546
 
547
  # Print beautiful console summary
548
  if not config.output.quiet:
 
554
  entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher,
555
  semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
556
  ):
557
+ """
558
+ Fetch metadata across all configured sources and pick the best match.
559
+
560
+ Delegates the heavy lifting to ``app_helper.fetch_and_compare_with_workflow``,
561
+ which runs identifier-based and title-based lookups in parallel and uses
562
+ cross-source corroboration to decide is_match. Google Scholar is consulted
563
+ only as a last-resort fallback because scraping is fragile and frequently
564
+ blocked.
565
+ """
566
+ from app_helper import fetch_and_compare_with_workflow as _parallel_lookup
567
+
568
+ primary = _parallel_lookup(
569
+ entry, workflow_config, arxiv_fetcher, crossref_fetcher,
570
+ semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator,
571
+ )
572
+
573
+ if primary and primary.source != "unable":
574
+ return primary
575
+
576
+ # Last-resort Google Scholar fallback (web scraping; frequently blocked).
577
+ if entry.title and scholar_fetcher:
578
+ try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
579
  scholar_result = scholar_fetcher.search_by_title(entry.title)
580
  if scholar_result:
581
+ return comparator.compare_with_scholar(entry, scholar_result)
582
+ except Exception as e:
583
+ logger.warning(
584
+ "Google Scholar fallback failed for entry=%s: %s",
585
+ getattr(entry, "key", "<unknown>"), e, exc_info=True,
586
+ )
587
+
588
+ return primary or comparator.create_unable_result(
589
+ entry, "Unable to find this paper in any data source"
590
+ )
 
 
591
 
592
 
593
  def get_abstract(entry, comparison_result, arxiv_fetcher):
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  gradio>=6.0.0
2
  bibtexparser>=1.4.0
3
  requests>=2.31.0
 
4
  beautifulsoup4>=4.12.0
5
  rich>=13.7.0
6
  Unidecode>=1.3.0
 
1
  gradio>=6.0.0
2
  bibtexparser>=1.4.0
3
  requests>=2.31.0
4
+ requests-cache>=1.2.0
5
  beautifulsoup4>=4.12.0
6
  rich>=13.7.0
7
  Unidecode>=1.3.0
scripts/install-hook.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Install a git pre-commit hook that runs BibGuard in --quick mode whenever
3
+ # the staged changes touch .bib or .tex files.
4
+ #
5
+ # Usage (run from the repo root that contains your paper, NOT BibGuard's repo):
6
+ # bash /path/to/BibGuard/scripts/install-hook.sh
7
+ #
8
+ # Skip the hook for one commit: git commit --no-verify
9
+ set -euo pipefail
10
+
11
+ if ! git rev-parse --git-dir >/dev/null 2>&1; then
12
+ echo "Error: not inside a git repo." >&2
13
+ exit 1
14
+ fi
15
+
16
+ HOOK_DIR="$(git rev-parse --git-dir)/hooks"
17
+ HOOK="$HOOK_DIR/pre-commit"
18
+
19
+ # Locate BibGuard's main.py — we assume this script lives in BibGuard/scripts/.
20
+ BIBGUARD_DIR="$(cd "$(dirname "$0")/.." && pwd)"
21
+ MAIN_PY="$BIBGUARD_DIR/main.py"
22
+ if [[ ! -f "$MAIN_PY" ]]; then
23
+ echo "Error: cannot locate BibGuard main.py at $MAIN_PY" >&2
24
+ exit 1
25
+ fi
26
+
27
+ mkdir -p "$HOOK_DIR"
28
+
29
+ if [[ -f "$HOOK" ]]; then
30
+ echo "A pre-commit hook already exists at $HOOK"
31
+ echo "Backing it up to $HOOK.bibguard-backup"
32
+ mv "$HOOK" "$HOOK.bibguard-backup"
33
+ fi
34
+
35
+ cat >"$HOOK" <<EOF
36
+ #!/usr/bin/env bash
37
+ # BibGuard pre-commit hook (auto-generated)
38
+ # Runs only if staged files include .tex or .bib.
39
+ set -e
40
+
41
+ if git diff --cached --name-only --diff-filter=ACM | grep -qE '\.(tex|bib)$'; then
42
+ echo "[BibGuard] Running quick checks on staged paper sources…"
43
+ python "$MAIN_PY" --quick || {
44
+ echo
45
+ echo "[BibGuard] Issues found. Fix or run: git commit --no-verify to skip."
46
+ exit 1
47
+ }
48
+ fi
49
+ EOF
50
+
51
+ chmod +x "$HOOK"
52
+ echo "Installed BibGuard pre-commit hook at: $HOOK"
53
+ echo "It will run only when staged files include .tex or .bib."
src/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (202 Bytes)
 
src/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (190 Bytes)
 
src/analyzers/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (464 Bytes)
 
src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc DELETED
Binary file (8.29 kB)
 
src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc DELETED
Binary file (5.4 kB)
 
src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc DELETED
Binary file (14.3 kB)
 
src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc DELETED
Binary file (18.9 kB)
 
src/analyzers/__pycache__/retraction_checker.cpython-313.pyc DELETED
Binary file (4.94 kB)
 
src/analyzers/__pycache__/url_validator.cpython-313.pyc DELETED
Binary file (8.3 kB)
 
src/analyzers/__pycache__/usage_checker.cpython-313.pyc DELETED
Binary file (4.4 kB)
 
src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc DELETED
Binary file (13.3 kB)
 
src/analyzers/llm_evaluator.py CHANGED
@@ -3,14 +3,18 @@ LLM-based citation relevance evaluator.
3
  Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends.
4
  """
5
  import json
 
6
  import re
7
- from dataclasses import dataclass
8
- from typing import Optional, Dict, Any
 
9
  from enum import Enum
10
  import os
11
 
12
  import requests
13
 
 
 
14
 
15
  class LLMBackend(Enum):
16
  OPENAI = "openai"
@@ -21,6 +25,52 @@ class LLMBackend(Enum):
21
  DEEPSEEK = "deepseek"
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  @dataclass
25
  class EvaluationResult:
26
  """Result of LLM citation evaluation."""
@@ -30,15 +80,16 @@ class EvaluationResult:
30
  explanation: str
31
  context_used: str
32
  abstract_used: str
 
33
  line_number: Optional[int] = None
34
  file_path: Optional[str] = None
35
  error: Optional[str] = None
36
-
37
  @property
38
  def score_label(self) -> str:
39
  labels = {
40
  1: "Not Relevant",
41
- 2: "Marginally Relevant",
42
  3: "Somewhat Relevant",
43
  4: "Relevant",
44
  5: "Highly Relevant"
@@ -49,7 +100,7 @@ class EvaluationResult:
49
  class LLMEvaluator:
50
  """Evaluates citation relevance using LLM."""
51
 
52
- PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant.
53
 
54
  ## Citation Context (from the manuscript):
55
  {context}
@@ -62,23 +113,28 @@ Evaluate the relevance and appropriateness of this citation. Consider:
62
  1. Does the citation support the claim being made in the context?
63
  2. Is the cited paper's topic related to the discussion?
64
  3. Is this citation necessary, or could it be replaced with a more relevant one?
 
 
 
 
 
 
 
 
 
 
65
 
66
  ## Response Format:
67
- Provide your response in the following JSON format:
68
  {{
69
- "relevance_score": <1-5 integer>,
70
- "is_relevant": <true/false>,
71
- "explanation": "<brief explanation in 1-2 sentences>"
 
72
  }}
73
 
74
- Score guide:
75
- - 1: Not relevant at all
76
- - 2: Marginally relevant
77
- - 3: Somewhat relevant
78
- - 4: Relevant and appropriate
79
- - 5: Highly relevant and essential
80
-
81
- STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other text."""
82
 
83
  def __init__(
84
  self,
@@ -90,28 +146,32 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
90
  self.backend = backend
91
  self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")
92
 
93
- # Set defaults based on backend
94
  if backend == LLMBackend.OPENAI:
95
  self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions"
96
- self.model = model or "gpt-5-mini"
97
  elif backend == LLMBackend.ANTHROPIC:
98
  self.endpoint = endpoint or "https://api.anthropic.com/v1/messages"
99
- self.model = model or "claude-4.5-haiku"
100
  elif backend == LLMBackend.DEEPSEEK:
101
  self.endpoint = endpoint or "https://api.deepseek.com/chat/completions"
102
  self.model = model or "deepseek-chat"
103
  elif backend == LLMBackend.OLLAMA:
104
  self.endpoint = endpoint or "http://localhost:11434/api/generate"
105
- self.model = model or "Qwen/qwen3-4B-Instruct-2507"
106
  elif backend == LLMBackend.VLLM:
107
  self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions"
108
- self.model = model or "Qwen/qwen3-4B-Instruct-2507"
109
  elif backend == LLMBackend.GEMINI:
110
  self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models"
111
- self.model = model or "gemini-2.5-flash-lite"
112
 
 
 
 
 
113
  def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
114
- """Evaluate citation relevance."""
115
  if not context or not abstract:
116
  return EvaluationResult(
117
  entry_key=entry_key,
@@ -122,34 +182,51 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
122
  abstract_used=abstract,
123
  error="Missing context or abstract for evaluation"
124
  )
125
-
126
- # Don't truncate - preserve full context and abstract
127
  prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)
128
-
129
- try:
130
- if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
131
- response = self._call_openai_compatible(prompt)
132
- elif self.backend == LLMBackend.ANTHROPIC:
133
- response = self._call_anthropic(prompt)
134
- elif self.backend == LLMBackend.OLLAMA:
135
- response = self._call_ollama(prompt)
136
- elif self.backend == LLMBackend.GEMINI:
137
- response = self._call_gemini(prompt)
138
- else:
139
- raise ValueError(f"Unknown backend: {self.backend}")
140
-
141
- return self._parse_response(entry_key, response, context, abstract)
142
-
143
- except Exception as e:
144
- return EvaluationResult(
145
- entry_key=entry_key,
146
- relevance_score=0,
147
- is_relevant=False,
148
- explanation="",
149
- context_used=context,
150
- abstract_used=abstract,
151
- error=str(e)
152
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  def _call_openai_compatible(self, prompt: str) -> str:
155
  """Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM)."""
@@ -272,24 +349,77 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
272
  return parts[0].get("text", "")
273
  return ""
274
 
275
- def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult:
276
- """Parse LLM response."""
277
- # Try to extract JSON from response
278
- json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
279
-
280
- data = {}
281
- if not json_match:
282
- # Try to parse the whole response as JSON
283
- try:
284
- data = json.loads(response.strip())
285
- except json.JSONDecodeError:
286
- pass
287
- else:
 
 
 
 
 
 
 
 
 
 
 
 
288
  try:
289
- data = json.loads(json_match.group())
 
 
290
  except json.JSONDecodeError:
291
  pass
292
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  if not data:
294
  return EvaluationResult(
295
  entry_key=entry_key,
@@ -301,27 +431,44 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
301
  error="Failed to parse LLM response as JSON"
302
  )
303
 
304
- # Extract fields
305
- relevance_score = data.get("relevance_score", 0)
306
- if isinstance(relevance_score, str):
307
- try:
308
- relevance_score = int(relevance_score)
309
- except ValueError:
310
- relevance_score = 0
311
-
312
- is_relevant = data.get("is_relevant", False)
313
  if isinstance(is_relevant, str):
314
- is_relevant = is_relevant.lower() in ("true", "yes", "1")
315
-
316
- explanation = data.get("explanation", "")
317
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  return EvaluationResult(
319
  entry_key=entry_key,
320
  relevance_score=relevance_score,
321
  is_relevant=is_relevant,
322
  explanation=explanation,
323
  context_used=context,
324
- abstract_used=abstract
 
325
  )
326
 
327
  def test_connection(self) -> bool:
@@ -371,6 +518,7 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
371
  }
372
  response = requests.post(url, json=payload, timeout=10)
373
  return response.status_code == 200
374
- except Exception:
 
375
  return False
376
  return False
 
3
  Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends.
4
  """
5
  import json
6
+ import logging
7
  import re
8
+ import time
9
+ from dataclasses import dataclass, field
10
+ from typing import Optional, Dict, Any, Tuple
11
  from enum import Enum
12
  import os
13
 
14
  import requests
15
 
16
+ logger = logging.getLogger(__name__)
17
+
18
 
19
  class LLMBackend(Enum):
20
  OPENAI = "openai"
 
25
  DEEPSEEK = "deepseek"
26
 
27
 
28
+ # Map backend → environment variable name for the API key.
29
+ _BACKEND_ENV = {
30
+ LLMBackend.OPENAI: "OPENAI_API_KEY",
31
+ LLMBackend.ANTHROPIC: "ANTHROPIC_API_KEY",
32
+ LLMBackend.GEMINI: "GEMINI_API_KEY",
33
+ LLMBackend.DEEPSEEK: "DEEPSEEK_API_KEY",
34
+ LLMBackend.VLLM: "VLLM_API_KEY",
35
+ LLMBackend.OLLAMA: "", # local, no key
36
+ }
37
+
38
+ # Order in which we auto-detect a usable backend when the user hasn't picked
39
+ # one explicitly. Cheapest/fastest first.
40
+ _AUTODETECT_ORDER = [
41
+ LLMBackend.GEMINI,
42
+ LLMBackend.OPENAI,
43
+ LLMBackend.DEEPSEEK,
44
+ LLMBackend.ANTHROPIC,
45
+ LLMBackend.OLLAMA,
46
+ ]
47
+
48
+
49
+ def autodetect_backend() -> Optional[Tuple[LLMBackend, str]]:
50
+ """
51
+ Find the first backend that has credentials in the environment.
52
+
53
+ Returns (backend, api_key) or None. For Ollama we attempt a localhost
54
+ probe so users with `ollama serve` running get auto-selected with no
55
+ config.
56
+ """
57
+ for backend in _AUTODETECT_ORDER:
58
+ env = _BACKEND_ENV.get(backend, "")
59
+ if env:
60
+ key = os.environ.get(env, "").strip()
61
+ if key:
62
+ return backend, key
63
+ elif backend == LLMBackend.OLLAMA:
64
+ # Local probe — small timeout so absence isn't painful.
65
+ try:
66
+ r = requests.get("http://localhost:11434/api/tags", timeout=1.0)
67
+ if r.status_code == 200:
68
+ return backend, ""
69
+ except requests.RequestException:
70
+ continue
71
+ return None
72
+
73
+
74
  @dataclass
75
  class EvaluationResult:
76
  """Result of LLM citation evaluation."""
 
80
  explanation: str
81
  context_used: str
82
  abstract_used: str
83
+ citation_role: str = "" # baseline | method | dataset | counterexample | survey | motivation | other
84
  line_number: Optional[int] = None
85
  file_path: Optional[str] = None
86
  error: Optional[str] = None
87
+
88
  @property
89
  def score_label(self) -> str:
90
  labels = {
91
  1: "Not Relevant",
92
+ 2: "Marginally Relevant",
93
  3: "Somewhat Relevant",
94
  4: "Relevant",
95
  5: "Highly Relevant"
 
100
  class LLMEvaluator:
101
  """Evaluates citation relevance using LLM."""
102
 
103
+ PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant, and identify the citation's role in the manuscript.
104
 
105
  ## Citation Context (from the manuscript):
106
  {context}
 
113
  1. Does the citation support the claim being made in the context?
114
  2. Is the cited paper's topic related to the discussion?
115
  3. Is this citation necessary, or could it be replaced with a more relevant one?
116
+ 4. What is the *role* of this citation in the manuscript?
117
+
118
+ ## Citation roles (pick exactly one):
119
+ - "baseline": cited paper is used/compared as a baseline or prior method.
120
+ - "method": cited paper introduces a method that the manuscript builds on or uses directly.
121
+ - "dataset": cited paper provides a dataset/benchmark the manuscript uses.
122
+ - "counterexample": cited to show a contrary finding or argue against.
123
+ - "survey": cited as a survey/overview reference.
124
+ - "motivation": cited to motivate the problem (background, application, statistics).
125
+ - "other": none of the above clearly applies.
126
 
127
  ## Response Format:
128
+ Respond with ONE JSON object, no other text:
129
  {{
130
+ "relevance_score": <integer 1-5>,
131
+ "is_relevant": <true|false>,
132
+ "citation_role": "<one of: baseline|method|dataset|counterexample|survey|motivation|other>",
133
+ "explanation": "<1-2 sentences>"
134
  }}
135
 
136
+ Score guide: 1=Not relevant, 2=Marginally, 3=Somewhat, 4=Relevant, 5=Highly relevant.
137
+ STRICTLY FOLLOW THE JSON FORMAT."""
 
 
 
 
 
 
138
 
139
  def __init__(
140
  self,
 
146
  self.backend = backend
147
  self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")
148
 
149
+ # Set defaults based on backend (cheap, fast models that exist)
150
  if backend == LLMBackend.OPENAI:
151
  self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions"
152
+ self.model = model or "gpt-4o-mini"
153
  elif backend == LLMBackend.ANTHROPIC:
154
  self.endpoint = endpoint or "https://api.anthropic.com/v1/messages"
155
+ self.model = model or "claude-haiku-4-5-20251001"
156
  elif backend == LLMBackend.DEEPSEEK:
157
  self.endpoint = endpoint or "https://api.deepseek.com/chat/completions"
158
  self.model = model or "deepseek-chat"
159
  elif backend == LLMBackend.OLLAMA:
160
  self.endpoint = endpoint or "http://localhost:11434/api/generate"
161
+ self.model = model or "qwen2.5:3b-instruct"
162
  elif backend == LLMBackend.VLLM:
163
  self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions"
164
+ self.model = model or "Qwen/Qwen2.5-3B-Instruct"
165
  elif backend == LLMBackend.GEMINI:
166
  self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models"
167
+ self.model = model or "gemini-2.5-flash"
168
 
169
+ # Retry config for transient LLM failures (rate limits, server errors, JSON issues).
170
+ MAX_ATTEMPTS = 3
171
+ RETRY_BASE_DELAY = 1.5 # seconds, exponential
172
+
173
  def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
174
+ """Evaluate citation relevance with retries on transient errors."""
175
  if not context or not abstract:
176
  return EvaluationResult(
177
  entry_key=entry_key,
 
182
  abstract_used=abstract,
183
  error="Missing context or abstract for evaluation"
184
  )
185
+
 
186
  prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)
187
+
188
+ last_err: Optional[str] = None
189
+ for attempt in range(1, self.MAX_ATTEMPTS + 1):
190
+ try:
191
+ if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
192
+ response = self._call_openai_compatible(prompt)
193
+ elif self.backend == LLMBackend.ANTHROPIC:
194
+ response = self._call_anthropic(prompt)
195
+ elif self.backend == LLMBackend.OLLAMA:
196
+ response = self._call_ollama(prompt)
197
+ elif self.backend == LLMBackend.GEMINI:
198
+ response = self._call_gemini(prompt)
199
+ else:
200
+ raise ValueError(f"Unknown backend: {self.backend}")
201
+
202
+ parsed = self._parse_response(entry_key, response, context, abstract)
203
+ # Successful structured parse → return.
204
+ if parsed.error is None:
205
+ return parsed
206
+ # JSON parse failed — retry with the same prompt; LLM jitter
207
+ # often resolves on a second pass.
208
+ last_err = parsed.error
209
+ except requests.exceptions.RequestException as e:
210
+ last_err = f"network: {e}"
211
+ # Transient: retry with backoff.
212
+ except Exception as e:
213
+ last_err = str(e)
214
+
215
+ if attempt < self.MAX_ATTEMPTS:
216
+ delay = self.RETRY_BASE_DELAY * (2 ** (attempt - 1))
217
+ logger.debug("LLM attempt %d/%d failed (%s); retrying in %.1fs",
218
+ attempt, self.MAX_ATTEMPTS, last_err, delay)
219
+ time.sleep(delay)
220
+
221
+ return EvaluationResult(
222
+ entry_key=entry_key,
223
+ relevance_score=0,
224
+ is_relevant=False,
225
+ explanation="",
226
+ context_used=context,
227
+ abstract_used=abstract,
228
+ error=last_err or "Unknown error after retries"
229
+ )
230
 
231
  def _call_openai_compatible(self, prompt: str) -> str:
232
  """Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM)."""
 
349
  return parts[0].get("text", "")
350
  return ""
351
 
352
+ @staticmethod
353
+ def _extract_json_object(text: str) -> Optional[dict]:
354
+ """
355
+ Robust JSON extraction. Handles:
356
+ - bare JSON
357
+ - fenced ```json ... ``` blocks
358
+ - JSON embedded in surrounding prose
359
+ - nested objects (the simple `\\{[^{}]*\\}` regex misses these)
360
+ """
361
+ if not text:
362
+ return None
363
+ s = text.strip()
364
+
365
+ # Direct parse
366
+ try:
367
+ obj = json.loads(s)
368
+ if isinstance(obj, dict):
369
+ return obj
370
+ except json.JSONDecodeError:
371
+ pass
372
+
373
+ # Strip Markdown code fences (```json ... ``` or ``` ... ```)
374
+ fence_match = re.search(r"```(?:json)?\s*(.*?)```", s, re.DOTALL | re.IGNORECASE)
375
+ if fence_match:
376
+ inner = fence_match.group(1).strip()
377
  try:
378
+ obj = json.loads(inner)
379
+ if isinstance(obj, dict):
380
+ return obj
381
  except json.JSONDecodeError:
382
  pass
383
+ s = inner # fall through to brace-balance scan on inner
384
+
385
+ # Brace-balanced scan: find the first complete top-level {...}.
386
+ start = s.find("{")
387
+ while start != -1:
388
+ depth = 0
389
+ in_str = False
390
+ esc = False
391
+ for i in range(start, len(s)):
392
+ ch = s[i]
393
+ if esc:
394
+ esc = False
395
+ continue
396
+ if ch == "\\":
397
+ esc = True
398
+ continue
399
+ if ch == '"':
400
+ in_str = not in_str
401
+ continue
402
+ if in_str:
403
+ continue
404
+ if ch == "{":
405
+ depth += 1
406
+ elif ch == "}":
407
+ depth -= 1
408
+ if depth == 0:
409
+ chunk = s[start:i + 1]
410
+ try:
411
+ obj = json.loads(chunk)
412
+ if isinstance(obj, dict):
413
+ return obj
414
+ except json.JSONDecodeError:
415
+ break
416
+ start = s.find("{", start + 1)
417
+ return None
418
+
419
+ def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult:
420
+ """Parse LLM response with robust JSON extraction."""
421
+ data = self._extract_json_object(response) or {}
422
+
423
  if not data:
424
  return EvaluationResult(
425
  entry_key=entry_key,
 
431
  error="Failed to parse LLM response as JSON"
432
  )
433
 
434
+ # Extract & validate fields
435
+ raw_score = data.get("relevance_score", data.get("score", 0))
436
+ try:
437
+ relevance_score = int(float(raw_score))
438
+ except (TypeError, ValueError):
439
+ relevance_score = 0
440
+ relevance_score = max(0, min(5, relevance_score))
441
+
442
+ is_relevant = data.get("is_relevant", relevance_score >= 4)
443
  if isinstance(is_relevant, str):
444
+ is_relevant = is_relevant.strip().lower() in ("true", "yes", "1", "y")
445
+
446
+ explanation = str(data.get("explanation", data.get("reason", ""))).strip()
447
+ citation_role = str(data.get("citation_role", data.get("role", ""))).strip().lower() or "other"
448
+ if citation_role not in {"baseline", "method", "dataset", "counterexample", "survey", "motivation", "other"}:
449
+ citation_role = "other"
450
+
451
+ # Sanity: a score of 0 means the LLM didn't actually return one — flag it.
452
+ if relevance_score == 0:
453
+ return EvaluationResult(
454
+ entry_key=entry_key,
455
+ relevance_score=0,
456
+ is_relevant=False,
457
+ explanation=explanation or response,
458
+ context_used=context,
459
+ abstract_used=abstract,
460
+ citation_role=citation_role,
461
+ error="LLM did not return a usable relevance_score",
462
+ )
463
+
464
  return EvaluationResult(
465
  entry_key=entry_key,
466
  relevance_score=relevance_score,
467
  is_relevant=is_relevant,
468
  explanation=explanation,
469
  context_used=context,
470
+ abstract_used=abstract,
471
+ citation_role=citation_role,
472
  )
473
 
474
  def test_connection(self) -> bool:
 
518
  }
519
  response = requests.post(url, json=payload, timeout=10)
520
  return response.status_code == 200
521
+ except Exception as e:
522
+ logger.debug("LLM test_connection failed for %s: %s", self.backend.value, e)
523
  return False
524
  return False
src/analyzers/metadata_comparator.py CHANGED
@@ -18,30 +18,41 @@ from ..utils.normalizer import TextNormalizer
18
  class ComparisonResult:
19
  """Result of comparing bib entry with fetched metadata."""
20
  entry_key: str
21
-
22
  # Title comparison
23
  title_match: bool
24
  title_similarity: float
25
  bib_title: str
26
  fetched_title: str
27
-
28
  # Author comparison
29
  author_match: bool
30
  author_similarity: float
31
  bib_authors: list[str]
32
  fetched_authors: list[str]
33
-
34
  # Year comparison
35
  year_match: bool
36
  bib_year: str
37
  fetched_year: str
38
-
39
  # Overall assessment
40
  is_match: bool
41
  confidence: float
42
  issues: list[str]
43
  source: str # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable'
44
-
 
 
 
 
 
 
 
 
 
 
 
45
  @property
46
  def has_issues(self) -> bool:
47
  return len(self.issues) > 0
@@ -60,7 +71,17 @@ class MetadataComparator:
60
  def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult:
61
  """Compare bib entry with arXiv metadata."""
62
  issues = []
63
-
 
 
 
 
 
 
 
 
 
 
64
  # Compare titles
65
  bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
66
  arxiv_title_norm = self.normalizer.normalize_for_comparison(arxiv_meta.title)
@@ -114,7 +135,8 @@ class MetadataComparator:
114
  is_match=is_match,
115
  confidence=confidence,
116
  issues=issues,
117
- source="arxiv"
 
118
  )
119
 
120
  def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult:
 
18
  class ComparisonResult:
19
  """Result of comparing bib entry with fetched metadata."""
20
  entry_key: str
21
+
22
  # Title comparison
23
  title_match: bool
24
  title_similarity: float
25
  bib_title: str
26
  fetched_title: str
27
+
28
  # Author comparison
29
  author_match: bool
30
  author_similarity: float
31
  bib_authors: list[str]
32
  fetched_authors: list[str]
33
+
34
  # Year comparison
35
  year_match: bool
36
  bib_year: str
37
  fetched_year: str
38
+
39
  # Overall assessment
40
  is_match: bool
41
  confidence: float
42
  issues: list[str]
43
  source: str # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable'
44
+
45
+ # F4: When an arXiv preprint has a published counterpart, surface it here.
46
+ published_version_hint: str = "" # e.g. "Also published at NeurIPS 2024 (doi:10.1145/...)"
47
+
48
+ # Positive / informational notes that should NOT be counted as issues
49
+ # (e.g. "corroborated by S2", "year differs by ≤1, treated as match").
50
+ notes: list[str] = None # type: ignore[assignment]
51
+
52
+ def __post_init__(self):
53
+ if self.notes is None:
54
+ self.notes = []
55
+
56
  @property
57
  def has_issues(self) -> bool:
58
  return len(self.issues) > 0
 
71
  def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult:
72
  """Compare bib entry with arXiv metadata."""
73
  issues = []
74
+
75
+ # F4: Extract a published-version hint if arXiv records it.
76
+ published_hint = ""
77
+ if arxiv_meta.journal_ref or arxiv_meta.doi:
78
+ parts = []
79
+ if arxiv_meta.journal_ref:
80
+ parts.append(arxiv_meta.journal_ref.strip())
81
+ if arxiv_meta.doi:
82
+ parts.append(f"doi:{arxiv_meta.doi.strip()}")
83
+ published_hint = "Has a published version — " + " | ".join(parts)
84
+
85
  # Compare titles
86
  bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
87
  arxiv_title_norm = self.normalizer.normalize_for_comparison(arxiv_meta.title)
 
135
  is_match=is_match,
136
  confidence=confidence,
137
  issues=issues,
138
+ source="arxiv",
139
+ published_version_hint=published_hint,
140
  )
141
 
142
  def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult:
src/checkers/__init__.py CHANGED
@@ -11,6 +11,7 @@ from .consistency_checker import ConsistencyChecker
11
  from .citation_quality_checker import CitationQualityChecker
12
  from .equation_checker import EquationChecker
13
  from .acronym_checker import AcronymChecker
 
14
 
15
  __all__ = [
16
  'BaseChecker',
@@ -27,6 +28,7 @@ __all__ = [
27
  'CitationQualityChecker',
28
  'EquationChecker',
29
  'AcronymChecker',
 
30
  ]
31
 
32
 
@@ -43,6 +45,7 @@ CHECKER_REGISTRY = {
43
  'citation_quality': CitationQualityChecker,
44
  'equation': EquationChecker,
45
  'acronym': AcronymChecker,
 
46
  }
47
 
48
 
 
11
  from .citation_quality_checker import CitationQualityChecker
12
  from .equation_checker import EquationChecker
13
  from .acronym_checker import AcronymChecker
14
+ from .template_checker import TemplateChecker
15
 
16
  __all__ = [
17
  'BaseChecker',
 
28
  'CitationQualityChecker',
29
  'EquationChecker',
30
  'AcronymChecker',
31
+ 'TemplateChecker',
32
  ]
33
 
34
 
 
45
  'citation_quality': CitationQualityChecker,
46
  'equation': EquationChecker,
47
  'acronym': AcronymChecker,
48
+ 'template': TemplateChecker,
49
  }
50
 
51
 
src/checkers/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (2.2 kB)
 
src/checkers/__pycache__/acronym_checker.cpython-313.pyc DELETED
Binary file (10.8 kB)
 
src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc DELETED
Binary file (6.14 kB)
 
src/checkers/__pycache__/anonymization_checker.cpython-313.pyc DELETED
Binary file (8.38 kB)
 
src/checkers/__pycache__/base.cpython-313.pyc DELETED
Binary file (7.68 kB)
 
src/checkers/__pycache__/caption_checker.cpython-313.pyc DELETED
Binary file (5.63 kB)
 
src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc DELETED
Binary file (5.41 kB)
 
src/checkers/__pycache__/consistency_checker.cpython-313.pyc DELETED
Binary file (11 kB)
 
src/checkers/__pycache__/equation_checker.cpython-313.pyc DELETED
Binary file (5.62 kB)
 
src/checkers/__pycache__/formatting_checker.cpython-313.pyc DELETED
Binary file (9.45 kB)
 
src/checkers/__pycache__/number_checker.cpython-313.pyc DELETED
Binary file (3.8 kB)
 
src/checkers/__pycache__/reference_checker.cpython-313.pyc DELETED
Binary file (8.3 kB)
 
src/checkers/__pycache__/sentence_checker.cpython-313.pyc DELETED
Binary file (4.36 kB)
 
src/checkers/acronym_checker.py CHANGED
@@ -87,23 +87,30 @@ class AcronymChecker(BaseChecker):
87
  }
88
 
89
  def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
 
90
  results = []
91
-
 
 
 
 
 
 
92
  # Remove comments using base class method
93
  content = self._remove_comments(tex_content)
94
-
95
  # Find all defined acronyms with their positions
96
  defined_acronyms = self._find_definitions(content)
97
-
98
  # Find all acronym usages (excluding special contexts)
99
  all_usages = self._find_all_usages(content)
100
-
101
  # NEW: Find potential full forms for each acronym
102
  acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())
103
-
104
  # Check for undefined acronyms (only those with matching full forms)
105
  for acronym, positions in all_usages.items():
106
- if acronym in self.COMMON_ACRONYMS:
107
  continue
108
 
109
  # Skip if no matching full form found in document
 
87
  }
88
 
89
  def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
90
+ config = config or {}
91
  results = []
92
+
93
+ # Project glossary: skip-set + auto-defined map
94
+ user_acronyms = dict(config.get('glossary_acronyms', {}) or {})
95
+ # All user-supplied acronyms are considered "known/defined" — never warn about them.
96
+ glossary_skip = {k.upper() for k in user_acronyms.keys()}
97
+ common_plus_glossary = self.COMMON_ACRONYMS | glossary_skip
98
+
99
  # Remove comments using base class method
100
  content = self._remove_comments(tex_content)
101
+
102
  # Find all defined acronyms with their positions
103
  defined_acronyms = self._find_definitions(content)
104
+
105
  # Find all acronym usages (excluding special contexts)
106
  all_usages = self._find_all_usages(content)
107
+
108
  # NEW: Find potential full forms for each acronym
109
  acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())
110
+
111
  # Check for undefined acronyms (only those with matching full forms)
112
  for acronym, positions in all_usages.items():
113
+ if acronym in common_plus_glossary:
114
  continue
115
 
116
  # Skip if no matching full form found in document
src/checkers/ai_artifacts_checker.py CHANGED
@@ -125,7 +125,7 @@ class AIArtifactsChecker(BaseChecker):
125
  severity=CheckSeverity.ERROR,
126
  message=f"{description} detected",
127
  line_number=line_num,
128
- line_content=line.strip()[:100],
129
  suggestion="Remove AI-generated conversational text"
130
  ))
131
  break # One match per line for this category
@@ -139,7 +139,7 @@ class AIArtifactsChecker(BaseChecker):
139
  severity=CheckSeverity.WARNING,
140
  message=f"{description}: '{match.group(0)[:50]}'",
141
  line_number=line_num,
142
- line_content=line.strip()[:100],
143
  suggestion="Replace placeholder with actual content or remove"
144
  ))
145
 
@@ -169,7 +169,7 @@ class AIArtifactsChecker(BaseChecker):
169
  severity=CheckSeverity.INFO,
170
  message=f"Possible {description} in LaTeX",
171
  line_number=line_num,
172
- line_content=line.strip()[:100],
173
  suggestion="Convert to LaTeX formatting or remove if unintentional"
174
  ))
175
 
 
125
  severity=CheckSeverity.ERROR,
126
  message=f"{description} detected",
127
  line_number=line_num,
128
+ line_content=line.strip(),
129
  suggestion="Remove AI-generated conversational text"
130
  ))
131
  break # One match per line for this category
 
139
  severity=CheckSeverity.WARNING,
140
  message=f"{description}: '{match.group(0)[:50]}'",
141
  line_number=line_num,
142
+ line_content=line.strip(),
143
  suggestion="Replace placeholder with actual content or remove"
144
  ))
145
 
 
169
  severity=CheckSeverity.INFO,
170
  message=f"Possible {description} in LaTeX",
171
  line_number=line_num,
172
+ line_content=line.strip(),
173
  suggestion="Convert to LaTeX formatting or remove if unintentional"
174
  ))
175
 
src/checkers/anonymization_checker.py CHANGED
@@ -79,7 +79,7 @@ class AnonymizationChecker(BaseChecker):
79
  severity=CheckSeverity.WARNING,
80
  message=f"{desc} in comment (could be revealed when compiling)",
81
  line_number=line_num,
82
- line_content=line.strip()[:100],
83
  suggestion="Remove or anonymize URL even in comments"
84
  ))
85
  continue
@@ -91,7 +91,7 @@ class AnonymizationChecker(BaseChecker):
91
  severity=CheckSeverity.ERROR,
92
  message=f"{desc} may reveal author identity",
93
  line_number=line_num,
94
- line_content=line.strip()[:100],
95
  suggestion="Replace with anonymized URL or remove for review"
96
  ))
97
 
@@ -112,7 +112,7 @@ class AnonymizationChecker(BaseChecker):
112
  severity=CheckSeverity.WARNING,
113
  message="Potentially self-revealing citation pattern",
114
  line_number=line_num,
115
- line_content=line.strip()[:100],
116
  suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
117
  ))
118
 
 
79
  severity=CheckSeverity.WARNING,
80
  message=f"{desc} in comment (could be revealed when compiling)",
81
  line_number=line_num,
82
+ line_content=line.strip(),
83
  suggestion="Remove or anonymize URL even in comments"
84
  ))
85
  continue
 
91
  severity=CheckSeverity.ERROR,
92
  message=f"{desc} may reveal author identity",
93
  line_number=line_num,
94
+ line_content=line.strip(),
95
  suggestion="Replace with anonymized URL or remove for review"
96
  ))
97
 
 
112
  severity=CheckSeverity.WARNING,
113
  message="Potentially self-revealing citation pattern",
114
  line_number=line_num,
115
+ line_content=line.strip(),
116
  suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
117
  ))
118
 
src/checkers/base.py CHANGED
@@ -29,7 +29,10 @@ class CheckResult:
29
  line_content: Optional[str] = None
30
  suggestion: Optional[str] = None
31
  file_path: Optional[str] = None
32
-
 
 
 
33
  def to_dict(self) -> dict:
34
  return {
35
  'checker': self.checker_name,
@@ -39,7 +42,8 @@ class CheckResult:
39
  'line': self.line_number,
40
  'content': self.line_content,
41
  'suggestion': self.suggestion,
42
- 'file_path': self.file_path
 
43
  }
44
 
45
 
@@ -178,7 +182,8 @@ class BaseChecker(ABC):
178
  message: str,
179
  line_number: Optional[int] = None,
180
  line_content: Optional[str] = None,
181
- suggestion: Optional[str] = None
 
182
  ) -> CheckResult:
183
  """Helper to create a CheckResult with this checker's name."""
184
  return CheckResult(
@@ -188,6 +193,7 @@ class BaseChecker(ABC):
188
  message=message,
189
  line_number=line_number,
190
  line_content=line_content,
191
- suggestion=suggestion
 
192
  )
193
 
 
29
  line_content: Optional[str] = None
30
  suggestion: Optional[str] = None
31
  file_path: Optional[str] = None
32
+ # Substring of line_content that triggered the issue. The renderer wraps
33
+ # this in <mark> so the user can see *where* in the line to look.
34
+ match_text: Optional[str] = None
35
+
36
  def to_dict(self) -> dict:
37
  return {
38
  'checker': self.checker_name,
 
42
  'line': self.line_number,
43
  'content': self.line_content,
44
  'suggestion': self.suggestion,
45
+ 'file_path': self.file_path,
46
+ 'match_text': self.match_text,
47
  }
48
 
49
 
 
182
  message: str,
183
  line_number: Optional[int] = None,
184
  line_content: Optional[str] = None,
185
+ suggestion: Optional[str] = None,
186
+ match_text: Optional[str] = None,
187
  ) -> CheckResult:
188
  """Helper to create a CheckResult with this checker's name."""
189
  return CheckResult(
 
193
  message=message,
194
  line_number=line_number,
195
  line_content=line_content,
196
+ suggestion=suggestion,
197
+ match_text=match_text,
198
  )
199
 
src/checkers/citation_quality_checker.py CHANGED
@@ -124,7 +124,7 @@ class CitationQualityChecker(BaseChecker):
124
  severity=CheckSeverity.WARNING,
125
  message="Appears to be hardcoded citation instead of \\cite",
126
  line_number=line_num,
127
- line_content=line.strip()[:80],
128
  suggestion="Use \\cite{} for proper bibliography management"
129
  ))
130
 
 
124
  severity=CheckSeverity.WARNING,
125
  message="Appears to be hardcoded citation instead of \\cite",
126
  line_number=line_num,
127
+ line_content=line.strip(),
128
  suggestion="Use \\cite{} for proper bibliography management"
129
  ))
130
 
src/checkers/consistency_checker.py CHANGED
@@ -149,25 +149,45 @@ class ConsistencyChecker(BaseChecker):
149
  ]
150
 
151
  def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
 
152
  results = []
153
-
154
  # Remove comments
155
  content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)
156
  content_lower = content.lower()
157
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # Check for known variant inconsistencies
159
- for canonical, variants in self.KNOWN_VARIANTS.items():
160
  found_forms = []
161
-
162
  # Check canonical form
163
  if re.search(rf'\b{re.escape(canonical)}\b', content, re.IGNORECASE):
164
  found_forms.append(canonical)
165
-
166
  # Check variants
167
  for variant in variants:
168
  if re.search(rf'\b{re.escape(variant)}\b', content, re.IGNORECASE):
169
  found_forms.append(variant)
170
-
171
  if len(found_forms) > 1:
172
  results.append(self._create_result(
173
  passed=False,
 
149
  ]
150
 
151
  def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
152
+ config = config or {}
153
  results = []
154
+
155
  # Remove comments
156
  content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)
157
  content_lower = content.lower()
158
+
159
+ # Merge user glossary preferred terms into the variant table
160
+ glossary_preferred = list(config.get('glossary_preferred', []))
161
+ merged_variants = dict(self.KNOWN_VARIANTS)
162
+ for term in glossary_preferred:
163
+ term = (term or "").strip()
164
+ if not term:
165
+ continue
166
+ # Generate plausible variants: hyphen ↔ space ↔ collapsed; lower vs canonical
167
+ forms = {term}
168
+ if "-" in term:
169
+ forms.add(term.replace("-", " "))
170
+ forms.add(term.replace("-", ""))
171
+ if " " in term:
172
+ forms.add(term.replace(" ", "-"))
173
+ forms.add(term.replace(" ", ""))
174
+ forms.discard(term)
175
+ if forms:
176
+ merged_variants.setdefault(term, []).extend(sorted(forms))
177
+
178
  # Check for known variant inconsistencies
179
+ for canonical, variants in merged_variants.items():
180
  found_forms = []
181
+
182
  # Check canonical form
183
  if re.search(rf'\b{re.escape(canonical)}\b', content, re.IGNORECASE):
184
  found_forms.append(canonical)
185
+
186
  # Check variants
187
  for variant in variants:
188
  if re.search(rf'\b{re.escape(variant)}\b', content, re.IGNORECASE):
189
  found_forms.append(variant)
190
+
191
  if len(found_forms) > 1:
192
  results.append(self._create_result(
193
  passed=False,
src/checkers/formatting_checker.py CHANGED
@@ -41,9 +41,6 @@ class FormattingChecker(BaseChecker):
41
  '^': r'(?<![\\$])\^(?![^$]*\$)', # Unescaped ^ outside math
42
  }
43
 
44
- # Multiple blank lines pattern (3 or more blank lines)
45
- MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')
46
-
47
  def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
48
  results = []
49
  lines = tex_content.split('\n')
@@ -66,8 +63,9 @@ class FormattingChecker(BaseChecker):
66
  severity=CheckSeverity.INFO,
67
  message="Citation without non-breaking space",
68
  line_number=line_num,
69
- line_content=line.strip()[:100],
70
- suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
 
71
  ))
72
 
73
  # Track citation styles
@@ -90,40 +88,6 @@ class FormattingChecker(BaseChecker):
90
  suggestion="Consider using consistent citation style throughout"
91
  ))
92
 
93
- # Check for multiple blank lines (3 or more)
94
- for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
95
- line_num = self._find_line_number(tex_content, match.start())
96
- # Count how many blank lines
97
- blank_count = match.group(0).count('\n') - 1
98
-
99
- # Get context: the line before, blank lines, and the line after
100
- start_pos = match.start()
101
- end_pos = match.end()
102
-
103
- # Find the line before the blank lines
104
- prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
105
- prev_line_end = start_pos
106
- prev_line = tex_content[prev_line_start:prev_line_end].rstrip()
107
-
108
- # Find the line after the blank lines
109
- next_line_end = tex_content.find('\n', end_pos)
110
- if next_line_end == -1:
111
- next_line_end = len(tex_content)
112
- next_line = tex_content[end_pos:next_line_end].rstrip()
113
-
114
- # Create visual representation with warning markers
115
- blank_lines = '\n'.join([f"> blank line ⚠️"] * blank_count)
116
- line_content = f"{prev_line}\n{blank_lines}\n{next_line}"
117
-
118
- results.append(self._create_result(
119
- passed=False,
120
- severity=CheckSeverity.INFO,
121
- message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
122
- line_number=line_num,
123
- line_content=line_content,
124
- suggestion="Reduce to single blank line or use \\vspace"
125
- ))
126
-
127
  # Check for common issues with special characters
128
  results.extend(self._check_special_chars(tex_content, lines))
129
 
@@ -159,8 +123,9 @@ class FormattingChecker(BaseChecker):
159
  severity=CheckSeverity.WARNING,
160
  message="Unescaped & outside tabular/math environment",
161
  line_number=line_num,
162
- line_content=line.strip()[:100],
163
- suggestion="Use \\& to escape"
 
164
  ))
165
 
166
  return results
 
41
  '^': r'(?<![\\$])\^(?![^$]*\$)', # Unescaped ^ outside math
42
  }
43
 
 
 
 
44
  def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
45
  results = []
46
  lines = tex_content.split('\n')
 
63
  severity=CheckSeverity.INFO,
64
  message="Citation without non-breaking space",
65
  line_number=line_num,
66
+ line_content=line.strip(),
67
+ suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')",
68
+ match_text=match.group(0),
69
  ))
70
 
71
  # Track citation styles
 
88
  suggestion="Consider using consistent citation style throughout"
89
  ))
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  # Check for common issues with special characters
92
  results.extend(self._check_special_chars(tex_content, lines))
93
 
 
123
  severity=CheckSeverity.WARNING,
124
  message="Unescaped & outside tabular/math environment",
125
  line_number=line_num,
126
+ line_content=line.strip(),
127
+ suggestion="Use \\& to escape",
128
+ match_text=match.group(0),
129
  ))
130
 
131
  return results
src/checkers/retraction_checker.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Bibliography-level checker that flags retracted DOIs.
3
+
4
+ Unlike the LaTeX-line checkers in src/checkers/, this one operates on parsed
5
+ BibEntry objects, not on a tex_content string. main.py / app.py invoke it
6
+ directly via `check_entries(entries)`.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import concurrent.futures
11
+ import logging
12
+ from dataclasses import dataclass
13
+ from typing import Iterable, List
14
+
15
+ from src.fetchers.retraction_fetcher import RetractionFetcher, RetractionResult
16
+ from src.parsers.bib_parser import BibEntry
17
+
18
logger = logging.getLogger(__name__)


@dataclass
class RetractionFinding:
    """A flagged bibliography entry: its bib key, DOI, and the raw fetcher result."""
    entry_key: str
    doi: str
    result: RetractionResult


class RetractionChecker:
    """Concurrent batch retraction lookup."""

    def __init__(self, max_workers: int = 6):
        # One shared fetcher; worker count is bounded so we don't hammer
        # the upstream retraction API.
        self.fetcher = RetractionFetcher()
        self.max_workers = max_workers

    def check_entries(self, entries: Iterable[BibEntry]) -> List[RetractionFinding]:
        """Look up retraction status for every entry that has a DOI.

        A single failed lookup (network error, malformed DOI, ...) is logged
        and skipped instead of aborting the whole batch: exceptions raised
        inside ``ThreadPoolExecutor.map`` workers are re-raised when the
        result is consumed, so each lookup is individually guarded.
        """
        with_doi = [e for e in entries if getattr(e, "doi", "")]
        if not with_doi:
            return []

        findings: List[RetractionFinding] = []

        def _one(entry: BibEntry):
            # Never let one bad lookup kill the executor map.
            try:
                return entry, self.fetcher.check(entry.doi)
            except Exception as exc:  # best-effort batch job
                logger.debug("Retraction lookup failed for %s: %s", entry.doi, exc)
                return entry, None

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            for entry, res in ex.map(_one, with_doi):
                if res is None:
                    continue
                # Flag outright retractions and softer updates (errata, concerns).
                if res.is_retracted or res.update_type:
                    findings.append(RetractionFinding(entry.key, entry.doi, res))
        return findings
src/checkers/sentence_checker.py CHANGED
@@ -76,7 +76,7 @@ class SentenceChecker(BaseChecker):
76
  severity=CheckSeverity.INFO,
77
  message=message,
78
  line_number=line_num,
79
- line_content=line.strip()[:80]
80
  ))
81
  break # One per line
82
 
 
76
  severity=CheckSeverity.INFO,
77
  message=message,
78
  line_number=line_num,
79
+ line_content=line.strip()
80
  ))
81
  break # One per line
82
 
src/checkers/template_checker.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Conference-template conformance checker.
3
+
4
+ Reads the rich rule set defined in :mod:`src.templates.base_template` and runs
5
+ per-venue checks against the LaTeX source. Each rule fragment lives in its own
6
+ small private method so adding new conferences (or new rules) doesn't bloat the
7
+ public ``check`` method.
8
+
9
+ Severity convention used here:
10
+
11
+ * ``ERROR`` — desk-reject material if uncorrected (NeurIPS missing checklist,
12
+ ACL missing Limitations, double-blind \\author leak).
13
+ * ``WARNING`` — likely a real problem but might be a false positive (style
14
+ package mismatch, identifying URL).
15
+ * ``INFO`` — soft reminder that something MUST happen later (camera-ready
16
+ sections, lay summaries, font requirements, page-limit
17
+ estimation that the .tex source can't actually verify).
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import re
22
+ from typing import List, Optional
23
+
24
+ from .base import BaseChecker, CheckResult, CheckSeverity
25
+
26
+
27
+ # ------------------------------------------------------------------ helpers ---
28
+
29
+ # Match \section{X}, \subsection{X}, \paragraph{X}, optionally starred,
30
+ # allowing an optional [short] argument before the {body}.
31
+ def _section_pattern(name: str) -> re.Pattern:
32
+ return re.compile(
33
+ r'\\(?:section|subsection|paragraph)\*?\s*(?:\[[^\]]*\])?\s*\{[^}]*?'
34
+ + re.escape(name) + r'[^}]*\}',
35
+ re.IGNORECASE,
36
+ )
37
+
38
+
39
+ # Domains/URL patterns that strongly de-anonymize an author. Whitelisted
40
+ # domains (which legitimately appear in CV/ML papers without leaking identity)
41
+ # are excluded.
42
+ _IDENTIFYING_URL_PATTERNS = [
43
+ re.compile(r'\bgithub\.com/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
44
+ re.compile(r'\b[A-Za-z0-9_\-]+\.github\.io\b', re.IGNORECASE),
45
+ re.compile(r'\bgitlab\.com/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
46
+ re.compile(r'\bbitbucket\.org/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
47
+ re.compile(r'\b(?:huggingface\.co|wandb\.ai)/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
48
+ re.compile(r'\b(?:linkedin|twitter|x)\.com/[A-Za-z0-9_\-]+', re.IGNORECASE),
49
+ ]
50
+
51
+ # URLs that are explicitly anonymous-friendly and should NOT be flagged.
52
+ _ANONYMOUS_URL_HINTS = re.compile(
53
+ r'(anonymous|anon|blind|review|submission|4open\.science)', re.IGNORECASE,
54
+ )
55
+
56
+ # Capture URLs from \url{...}, \href{...}{...}, and bare http(s)://...
57
+ _URL_FROM_TEX = re.compile(
58
+ r'\\(?:url|href)\s*\{([^}]+)\}|(?<![/\w])(https?://[^\s,)\\]+)',
59
+ )
60
+
61
+ # Acknowledgments macros / sections used by various templates.
62
+ _ACK_PATTERNS = [
63
+ re.compile(r'\\section\*?\s*\{\s*Acknowledg\w*\s*\}', re.IGNORECASE),
64
+ re.compile(r'\\acknowledgments?\s*\{', re.IGNORECASE),
65
+ re.compile(r'\\begin\{acks\}', re.IGNORECASE),
66
+ ]
67
+
68
+ # NeurIPS Paper Checklist markers — the official template either calls
69
+ # \input{neurips_paper_checklist} or includes a \section*{NeurIPS Paper Checklist}.
70
+ _NEURIPS_CHECKLIST_PATTERNS = [
71
+ re.compile(r'\\section\*?\s*\{[^}]*Paper\s+Checklist[^}]*\}', re.IGNORECASE),
72
+ re.compile(r'\\input\{[^}]*paper[_\-]?checklist[^}]*\}', re.IGNORECASE),
73
+ re.compile(r'\\input\{[^}]*neurips[_\-]?\d{0,4}[_\-]?checklist[^}]*\}', re.IGNORECASE),
74
+ re.compile(r'\\paperchecklist\b', re.IGNORECASE),
75
+ ]
76
+
77
+ # Reproducibility Statement (ICLR / NeurIPS).
78
+ _REPRO_SECTION = re.compile(
79
+ r'\\section\*?\s*\{[^}]*Reproducibility[^}]*\}', re.IGNORECASE,
80
+ )
81
+
82
+ # Document-class options carry the paper size.
83
+ _DOCCLASS_RE = re.compile(
84
+ r'\\documentclass\s*(?:\[([^\]]*)\])?\s*\{([^}]+)\}'
85
+ )
86
+
87
+ # A very rough regex for figures/tables INSIDE the Limitations section
88
+ # (used to enforce ACL "discussion only" rule).
89
+ _FLOAT_OR_NEW_SECTION_RE = re.compile(
90
+ r'\\begin\{(?:table|figure|algorithm)\*?\}|\\section\*?\s*\{', re.IGNORECASE,
91
+ )
92
+
93
+
94
+ # ----------------------------------------------------------------- checker ---
95
+
96
class TemplateChecker(BaseChecker):
    """Run per-venue submission checks driven by a template rule object.

    ``config['template']`` is expected to be a venue-rule object (see
    src.templates.base_template); when it is absent the checker is a no-op.
    Fix vs. previous revision: the document-class suggestion no longer
    hard-codes the "(Springer LNCS for ECCV)" rationale for every venue.
    """

    name = "template"
    display_name = "Conference Template"
    description = "Verify per-venue submission rules (sections, style, anonymity, deliverables)"

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Dispatch every rule the template enables and collect findings."""
        config = config or {}
        template = config.get("template")
        if template is None:
            return []  # no venue selected -> nothing to verify

        content = self._remove_comments(tex_content)
        results: List[CheckResult] = []

        # Rules that apply to every venue.
        self._check_mandatory_sections(template, content, results)
        self._check_camera_only_sections(template, content, results)
        self._check_style_package(template, content, results)
        self._check_doc_class(template, content, results)
        self._check_paper_size(template, content, results)

        # Anonymity rules.
        if template.double_blind:
            self._check_double_blind_author(template, content, results)
        if template.forbid_identifying_urls:
            self._check_identifying_urls(template, content, results)
        if template.forbid_acks_in_review:
            self._check_acknowledgments(template, content, results)

        # Venue-specific deliverables.
        if template.requires_paper_checklist:
            self._check_paper_checklist(template, content, results)
        if template.requires_reproducibility_statement:
            self._check_reproducibility_statement(template, content, results)
        if template.requires_lay_summary_camera:
            self._inform_lay_summary(template, results)
        if template.requires_type1_fonts:
            self._inform_type1_fonts(template, results)
        if template.min_main_pages > 0:
            self._inform_min_pages(template, results)

        if "Limitations" in template.mandatory_sections:
            self._check_limitations_content(template, content, results)

        return results

    # ============================================================== sections ==

    def _check_mandatory_sections(self, template, content: str, results: List[CheckResult]):
        """ERROR for each template-mandated section missing from the source."""
        for section in template.mandatory_sections or []:
            if not _section_pattern(section).search(content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.ERROR,
                    message=f"[{template.name}] Missing mandatory section: '{section}'",
                    suggestion=f"Add `\\section{{{section}}}` (required by {template.name}).",
                ))

    def _check_camera_only_sections(self, template, content: str, results: List[CheckResult]):
        """INFO for sections only required at camera-ready time."""
        for section in template.mandatory_camera_sections or []:
            if not _section_pattern(section).search(content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=(
                        f"[{template.name}] Camera-ready section '{section}' not found. "
                        "Required for the camera-ready version, optional for review."
                    ),
                    suggestion=f"Add `\\section{{{section}}}` before References for camera-ready.",
                ))

    # =================================================== style / typesetting ==

    def _check_style_package(self, template, content: str, results: List[CheckResult]):
        """WARNING when the venue's official style package is not loaded."""
        pkg = (template.style_package or "").strip()
        if not pkg:
            return
        pkg_re = re.compile(
            r'\\(?:usepackage|documentclass)(?:\[[^\]]*\])?\s*\{\s*'
            + re.escape(pkg) + r'\s*\}'
        )
        if not pkg_re.search(content):
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message=(
                    f"[{template.name}] Style package '{pkg}' not found. "
                    "If you really are submitting to this venue, your template may be wrong."
                ),
                suggestion=f"Use the official `{pkg}` style package.",
            ))

    def _check_doc_class(self, template, content: str, results: List[CheckResult]):
        """WARNING when \\documentclass differs from the venue's required class."""
        wanted = (template.doc_class or "").strip()
        if not wanted:
            return
        m = _DOCCLASS_RE.search(content)
        actual = m.group(2).strip() if m else ""
        if actual.lower() != wanted.lower():
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message=(
                    f"[{template.name}] Expected `\\documentclass{{{wanted}}}`, "
                    f"found `{actual or 'none'}`."
                ),
                # Fixed: the suggestion used to append "(Springer LNCS for ECCV)"
                # for every venue; keep it venue-generic instead.
                suggestion=f"Use the official document class `{wanted}` required by {template.name}.",
            ))

    def _check_paper_size(self, template, content: str, results: List[CheckResult]):
        """WARNING when documentclass options pick the wrong paper size."""
        wanted = (template.paper_size or "").lower()
        if wanted not in {"letter", "a4"}:
            return
        m = _DOCCLASS_RE.search(content)
        if not m:
            return
        opts = (m.group(1) or "").lower()
        actual = None
        if "letterpaper" in opts or "letter" in opts:
            actual = "letter"
        elif "a4paper" in opts or "a4" in opts:
            actual = "a4"
        # Silent when no size option is present — the class default applies.
        if actual and actual != wanted:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message=(
                    f"[{template.name}] Expected paper size '{wanted}', "
                    f"document class is set to '{actual}'."
                ),
                suggestion=f"Use `\\documentclass[{wanted}paper]{{...}}`.",
            ))

    # ================================================================ blinding =

    def _check_double_blind_author(self, template, content: str, results: List[CheckResult]):
        """ERROR when \\author carries apparently identifying content."""
        # NOTE(review): [^}]* stops at the first closing brace, so \author
        # bodies with nested braces (e.g. \thanks{...}) are only partially
        # inspected — confirm acceptable for the templates in use.
        m = re.search(r'\\author\s*(?:\[[^\]]*\])?\s*\{([^}]*)\}', content)
        if not m:
            return
        body = m.group(1)
        if not body.strip():
            return
        if re.search(r'(anonymous|hidden|blind|submission)', body, re.IGNORECASE):
            return
        line_num = self._find_line_number(content, m.start())
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.ERROR,
            message=f"[{template.name}] Double-blind: \\author appears to contain identifying info",
            line_number=line_num,
            line_content=body.strip(),
            suggestion=r"Replace \author with anonymous placeholder during review.",
        ))

    def _check_identifying_urls(self, template, content: str, results: List[CheckResult]):
        """WARNING for each URL matching a de-anonymizing host pattern."""
        for m in _URL_FROM_TEX.finditer(content):
            url = (m.group(1) or m.group(2) or "").strip()
            if not url:
                continue
            if _ANONYMOUS_URL_HINTS.search(url):
                continue  # explicitly anonymized links are fine
            for pat in _IDENTIFYING_URL_PATTERNS:
                if pat.search(url):
                    line_num = self._find_line_number(content, m.start())
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=(
                            f"[{template.name}] Possible identifying URL during double-blind review: "
                            f"{url[:120]}"
                        ),
                        line_number=line_num,
                        line_content=url,
                        suggestion=(
                            "Use Anonymous GitHub (https://anonymous.4open.science) or remove "
                            "the link until the camera-ready version."
                        ),
                    ))
                    break  # one finding per URL

    def _check_acknowledgments(self, template, content: str, results: List[CheckResult]):
        """WARNING when an acknowledgments block is present during review."""
        for pat in _ACK_PATTERNS:
            m = pat.search(content)
            if m:
                line_num = self._find_line_number(content, m.start())
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=(
                        f"[{template.name}] Acknowledgments section detected; "
                        f"{template.short_name.upper()} requires omitting it during review."
                    ),
                    line_number=line_num,
                    suggestion=(
                        "Comment out or wrap acks in `\\if<reviewmode>...\\fi` so they only "
                        "appear in the camera-ready version."
                    ),
                ))
                return  # one finding is enough

    # ============================================== per-venue special items ===

    def _check_paper_checklist(self, template, content: str, results: List[CheckResult]):
        """ERROR when no NeurIPS Paper Checklist marker is found."""
        for pat in _NEURIPS_CHECKLIST_PATTERNS:
            if pat.search(content):
                return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.ERROR,
            message=(
                f"[{template.name}] NeurIPS Paper Checklist not found. "
                "NeurIPS desk-rejects submissions without the checklist."
            ),
            suggestion=(
                "Add `\\input{neurips_paper_checklist}` (or paste the official template) "
                "after References / supplementary."
            ),
        ))

    def _check_reproducibility_statement(self, template, content: str, results: List[CheckResult]):
        """INFO when no Reproducibility Statement section is present."""
        if _REPRO_SECTION.search(content):
            return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.INFO,
            message=(
                f"[{template.name}] Reproducibility Statement not found. "
                "It's encouraged (~1 page) and does not count toward the page limit."
            ),
            suggestion=(
                "Add `\\section*{Reproducibility Statement}` before References summarizing "
                "code/data/seeds/hyperparameter availability."
            ),
        ))

    def _inform_lay_summary(self, template, results: List[CheckResult]):
        """INFO reminder: lay summary is due at camera-ready time."""
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.INFO,
            message=(
                f"[{template.name}] Lay summary required at camera-ready time "
                "(plain-language summary submitted via OpenReview)."
            ),
            suggestion="Draft a 1–2 paragraph plain-language summary now to avoid a last-minute scramble.",
        ))

    def _inform_type1_fonts(self, template, results: List[CheckResult]):
        """INFO reminder: Type-1 font requirement is only verifiable on the PDF."""
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.INFO,
            message=(
                f"[{template.name}] Embedded fonts must be Type-1 only — verify with "
                "`pdffonts <paper.pdf>`. Cannot be checked from .tex source alone."
            ),
            suggestion="Compile with `pdflatex` (not XeLaTeX/LuaLaTeX) and convert any Type-3 fonts.",
        ))

    def _inform_min_pages(self, template, results: List[CheckResult]):
        """INFO reminder: page bounds can only be checked on the rendered PDF."""
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.INFO,
            message=(
                f"[{template.name}] Main text must be at least {template.min_main_pages} pages "
                f"and at most {template.page_limit_review} pages. Cannot be measured from source."
            ),
            suggestion=(
                f"Compile and confirm the rendered PDF stays within "
                f"{template.min_main_pages}–{template.page_limit_review} pages of main text."
            ),
        ))

    # ============================================ ACL family: Limitations rule

    def _check_limitations_content(self, template, content: str, results: List[CheckResult]):
        """WARNING when the Limitations section contains floats or a section."""
        # Find the Limitations section span up to the next \section or end of doc.
        m = re.search(
            r'(\\section\*?\s*(?:\[[^\]]*\])?\s*\{[^}]*Limitations[^}]*\})',
            content, re.IGNORECASE,
        )
        if not m:
            return  # mandatory_sections check already flagged absence
        start = m.end()
        nxt = re.search(r'\\section\*?\s*\{', content[start:], re.IGNORECASE)
        end = start + nxt.start() if nxt else len(content)
        section_body = content[start:end]
        # Discussion-only rule: no floats, no nested \section
        if _FLOAT_OR_NEW_SECTION_RE.search(section_body):
            line_num = self._find_line_number(content, start)
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message=(
                    f"[{template.name}] Limitations section appears to contain floats or a "
                    "nested section. ACL/EMNLP/NAACL require Limitations to be discussion only."
                ),
                line_number=line_num,
                suggestion=(
                    "Move tables/figures/algorithms out of Limitations into the main body or "
                    "appendix; Limitations should be prose-only."
                ),
            ))
src/checkers/url_checker.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ URL liveness checker for bibliography entries.
3
+
4
+ Many @misc / blog / repo references rot over time. This checker does a HEAD
5
+ (falling back to a small GET) on entry.url and flags anything that returns
6
+ 4xx/5xx or fails to connect.
7
+
8
+ Operates on BibEntry objects, not on tex_content. Invoked from main.py / app.py
9
+ when `submission_extra.url_liveness` is true.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import concurrent.futures
14
+ import logging
15
+ from dataclasses import dataclass
16
+ from typing import Iterable, List, Optional
17
+
18
+ import requests
19
+
20
+ from src.utils.http import get_session
21
+ from src.parsers.bib_parser import BibEntry
22
+
23
logger = logging.getLogger(__name__)


@dataclass
class URLFinding:
    """Liveness verdict for one bibliography URL."""
    entry_key: str
    url: str
    status: str  # "ok" | "broken" | "unreachable" | "skipped"
    status_code: Optional[int] = None
    detail: str = ""


class URLChecker:
    """Concurrent HEAD-then-GET liveness check."""

    # Non-HTTP schemes we never probe.
    SKIP_PREFIXES = ("mailto:", "ftp://", "tel:", "javascript:")

    def __init__(self, max_workers: int = 8, timeout: float = 15.0):
        self.max_workers = max_workers
        self.timeout = timeout

    def _check_one(self, entry: BibEntry) -> Optional[URLFinding]:
        """Probe one entry's URL; return a finding or None for empty URLs."""
        url = (entry.url or "").strip()
        if not url:
            return None
        if any(url.lower().startswith(p) for p in self.SKIP_PREFIXES):
            return URLFinding(entry.key, url, "skipped", detail="non-http scheme")

        session = get_session()
        try:
            r = session.head(url, allow_redirects=True, timeout=self.timeout)
            # Many servers return 405/403 for HEAD but are fine with GET;
            # double-check with a streamed GET and discard the body.
            if r.status_code in (403, 405, 501):
                r = session.get(url, allow_redirects=True, timeout=self.timeout, stream=True)
                r.close()  # don't actually read the body
        except requests.RequestException as e:
            logger.debug("URL check failed for %s: %s", url, e, exc_info=True)
            return URLFinding(entry.key, url, "unreachable", detail=str(e)[:120])

        if 200 <= r.status_code < 400:
            return URLFinding(entry.key, url, "ok", status_code=r.status_code)
        return URLFinding(
            entry.key, url, "broken",
            status_code=r.status_code,
            detail=f"HTTP {r.status_code}",
        )

    def check_entries(self, entries: Iterable[BibEntry]) -> List[URLFinding]:
        """Check every entry that has a URL, in parallel.

        Fix vs. previous revision: `_check_one` only catches
        ``requests.RequestException``, and exceptions raised inside
        ``ThreadPoolExecutor.map`` workers re-raise on result consumption —
        so one unexpected failure used to abort the whole batch. Each
        worker call is now individually guarded.
        """
        targets = [e for e in entries if getattr(e, "url", "")]
        if not targets:
            return []

        def _safe(entry: BibEntry) -> Optional[URLFinding]:
            # A single malformed entry must not kill the batch.
            try:
                return self._check_one(entry)
            except Exception as exc:  # best-effort batch job
                logger.debug("URL check crashed for %s: %s", getattr(entry, "url", "?"), exc)
                return None

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            findings = [f for f in ex.map(_safe, targets) if f is not None]
        return findings
src/config/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (362 Bytes)
 
src/config/__pycache__/workflow.cpython-313.pyc DELETED
Binary file (7.96 kB)
 
src/config/__pycache__/yaml_config.cpython-313.pyc DELETED
Binary file (12.4 kB)
 
src/config/yaml_config.py CHANGED
@@ -97,11 +97,36 @@ class LLMConfig:
97
  api_key: str = ""
98
 
99
 
100
- @dataclass
101
  class OutputConfig:
102
  """Output configuration."""
103
  quiet: bool = False
104
  minimal_verified: bool = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
 
107
  @dataclass
@@ -111,9 +136,12 @@ class BibGuardConfig:
111
  template: str = ""
112
  bibliography: BibliographyConfig = field(default_factory=BibliographyConfig)
113
  submission: SubmissionConfig = field(default_factory=SubmissionConfig)
 
114
  workflow: List[WorkflowStep] = field(default_factory=list)
115
  llm: LLMConfig = field(default_factory=LLMConfig)
116
  output: OutputConfig = field(default_factory=OutputConfig)
 
 
117
 
118
  # Internal fields to store discovered files in directory mode
119
  _bib_files: List[Path] = field(default_factory=list)
@@ -225,11 +253,48 @@ def load_config(config_path: str) -> BibGuardConfig:
225
  # Parse output section
226
  if 'output' in data:
227
  out = data['output']
 
 
 
228
  config.output = OutputConfig(
229
  quiet=out.get('quiet', False),
230
- minimal_verified=out.get('minimal_verified', False)
 
231
  )
232
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  return config
234
 
235
 
@@ -264,6 +329,15 @@ files:
264
 
265
  template: ""
266
 
 
 
 
 
 
 
 
 
 
267
  bibliography:
268
  check_metadata: true
269
  check_usage: true
@@ -285,16 +359,27 @@ submission:
285
  citation_quality: true
286
  anonymization: true
287
 
 
 
 
 
 
 
 
 
 
 
288
  llm:
289
- backend: "gemini"
290
- model: ""
291
- api_key: ""
292
 
293
  output:
294
  quiet: false
295
  minimal_verified: false
 
296
  """
297
  with open(output_path, 'w', encoding='utf-8') as f:
298
  f.write(default)
299
-
300
  return output_path
 
97
  api_key: str = ""
98
 
99
 
100
@dataclass
class OutputConfig:
    """Output configuration."""
    quiet: bool = False
    minimal_verified: bool = False
    # Report formats to emit; any subset of: markdown, html, json.
    formats: List[str] = field(default_factory=lambda: ["markdown", "html"])
106
+
107
+
108
@dataclass
class NetworkConfig:
    """Network / politeness configuration."""
    # Contact address advertised to polite-pool APIs; empty means unset.
    contact_email: str = ""
    cache_enabled: bool = True
    cache_ttl_hours: int = 24
    retry_total: int = 5
    retry_backoff_factor: float = 1.5
117
+
118
@dataclass
class GlossaryConfig:
    """User-supplied project glossary for ConsistencyChecker / AcronymChecker."""
    # House-style spellings to enforce, e.g. ["Transformer", "fine-tuning"].
    preferred: List[str] = field(default_factory=list)
    # Acronym expansions, e.g. {"NLP": "Natural Language Processing"}.
    acronyms: Dict[str, str] = field(default_factory=dict)
123
+
124
+
125
@dataclass
class SubmissionExtraConfig:
    """Extra submission checks added on top of the original list."""
    # HEAD-check every entry URL; slow, so disabled by default.
    url_liveness: bool = False
    # Flag retracted DOIs; cheap, so enabled by default.
    retraction: bool = True
130
 
131
 
132
  @dataclass
 
136
  template: str = ""
137
  bibliography: BibliographyConfig = field(default_factory=BibliographyConfig)
138
  submission: SubmissionConfig = field(default_factory=SubmissionConfig)
139
+ submission_extra: SubmissionExtraConfig = field(default_factory=SubmissionExtraConfig)
140
  workflow: List[WorkflowStep] = field(default_factory=list)
141
  llm: LLMConfig = field(default_factory=LLMConfig)
142
  output: OutputConfig = field(default_factory=OutputConfig)
143
+ network: NetworkConfig = field(default_factory=NetworkConfig)
144
+ glossary: GlossaryConfig = field(default_factory=GlossaryConfig)
145
 
146
  # Internal fields to store discovered files in directory mode
147
  _bib_files: List[Path] = field(default_factory=list)
 
253
  # Parse output section
254
  if 'output' in data:
255
  out = data['output']
256
+ formats = out.get('formats', ["markdown", "html"])
257
+ if isinstance(formats, str):
258
+ formats = [f.strip() for f in formats.split(",") if f.strip()]
259
  config.output = OutputConfig(
260
  quiet=out.get('quiet', False),
261
+ minimal_verified=out.get('minimal_verified', False),
262
+ formats=list(formats),
263
  )
264
+
265
+ # Parse network section
266
+ if 'network' in data:
267
+ net = data['network'] or {}
268
+ config.network = NetworkConfig(
269
+ contact_email=net.get('contact_email', ''),
270
+ cache_enabled=bool(net.get('cache_enabled', True)),
271
+ cache_ttl_hours=int(net.get('cache_ttl_hours', 24)),
272
+ retry_total=int(net.get('retry_total', 5)),
273
+ retry_backoff_factor=float(net.get('retry_backoff_factor', 1.5)),
274
+ )
275
+
276
+ # Parse glossary section
277
+ if 'glossary' in data:
278
+ g = data['glossary'] or {}
279
+ preferred = g.get('preferred', []) or []
280
+ acronyms = g.get('acronyms', {}) or {}
281
+ if not isinstance(preferred, list):
282
+ preferred = [str(preferred)]
283
+ if not isinstance(acronyms, dict):
284
+ acronyms = {}
285
+ config.glossary = GlossaryConfig(
286
+ preferred=[str(x) for x in preferred],
287
+ acronyms={str(k): str(v) for k, v in acronyms.items()},
288
+ )
289
+
290
+ # Parse submission_extra section (URL liveness, retraction)
291
+ if 'submission_extra' in data:
292
+ sx = data['submission_extra'] or {}
293
+ config.submission_extra = SubmissionExtraConfig(
294
+ url_liveness=bool(sx.get('url_liveness', False)),
295
+ retraction=bool(sx.get('retraction', True)),
296
+ )
297
+
298
  return config
299
 
300
 
 
329
 
330
  template: ""
331
 
332
+ network:
333
+ # Real email used in polite-pool User-Agents (arXiv/CrossRef/OpenAlex).
334
+ # Strongly recommended.
335
+ contact_email: ""
336
+ cache_enabled: true # Local SQLite cache for HTTP responses
337
+ cache_ttl_hours: 24
338
+ retry_total: 5
339
+ retry_backoff_factor: 1.5
340
+
341
  bibliography:
342
  check_metadata: true
343
  check_usage: true
 
359
  citation_quality: true
360
  anonymization: true
361
 
362
+ submission_extra:
363
+ url_liveness: false # HEAD-check every entry.url field (slow, off by default)
364
+ retraction: true # Flag retracted DOIs via CrossRef
365
+
366
+ # Project-specific glossary helps ConsistencyChecker and AcronymChecker
367
+ # avoid false positives and enforce house style.
368
+ glossary:
369
+ preferred: [] # e.g. ["Transformer", "fine-tuning"]
370
+ acronyms: {} # e.g. {NLP: "Natural Language Processing"}
371
+
372
  llm:
373
+ backend: "gemini" # gemini | openai | anthropic | deepseek | ollama | vllm
374
+ model: "" # leave empty for sensible default per backend
375
+ api_key: "" # prefer env var <BACKEND>_API_KEY
376
 
377
  output:
378
  quiet: false
379
  minimal_verified: false
380
+ formats: [markdown, html] # any of: markdown, html, json
381
  """
382
  with open(output_path, 'w', encoding='utf-8') as f:
383
  f.write(default)
384
+
385
  return output_path