thinkwee commited on
Commit ·
fcffa22
1
Parent(s): 79d7264
v2.0
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +95 -11
- README.md +166 -111
- app.py +1109 -793
- app_helper.py +305 -96
- bibguard.yaml +40 -7
- main.py +214 -166
- requirements.txt +1 -0
- scripts/install-hook.sh +53 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/__init__.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/retraction_checker.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/url_validator.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/usage_checker.cpython-313.pyc +0 -0
- src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc +0 -0
- src/analyzers/llm_evaluator.py +229 -81
- src/analyzers/metadata_comparator.py +29 -7
- src/checkers/__init__.py +3 -0
- src/checkers/__pycache__/__init__.cpython-313.pyc +0 -0
- src/checkers/__pycache__/acronym_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/anonymization_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/base.cpython-313.pyc +0 -0
- src/checkers/__pycache__/caption_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/consistency_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/equation_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/formatting_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/number_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/reference_checker.cpython-313.pyc +0 -0
- src/checkers/__pycache__/sentence_checker.cpython-313.pyc +0 -0
- src/checkers/acronym_checker.py +13 -6
- src/checkers/ai_artifacts_checker.py +3 -3
- src/checkers/anonymization_checker.py +3 -3
- src/checkers/base.py +10 -4
- src/checkers/citation_quality_checker.py +1 -1
- src/checkers/consistency_checker.py +26 -6
- src/checkers/formatting_checker.py +6 -41
- src/checkers/retraction_checker.py +53 -0
- src/checkers/sentence_checker.py +1 -1
- src/checkers/template_checker.py +393 -0
- src/checkers/url_checker.py +80 -0
- src/config/__pycache__/__init__.cpython-313.pyc +0 -0
- src/config/__pycache__/workflow.cpython-313.pyc +0 -0
- src/config/__pycache__/yaml_config.cpython-313.pyc +0 -0
- src/config/yaml_config.py +92 -7
.gitignore
CHANGED
|
@@ -1,9 +1,13 @@
|
|
|
|
|
| 1 |
# Python
|
|
|
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
| 4 |
*$py.class
|
| 5 |
*.so
|
| 6 |
.Python
|
|
|
|
|
|
|
| 7 |
build/
|
| 8 |
develop-eggs/
|
| 9 |
dist/
|
|
@@ -20,32 +24,96 @@ wheels/
|
|
| 20 |
.installed.cfg
|
| 21 |
*.egg
|
| 22 |
MANIFEST
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
#
|
|
|
|
|
|
|
| 25 |
venv/
|
| 26 |
env/
|
| 27 |
.env
|
|
|
|
|
|
|
| 28 |
.venv/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
#
|
|
|
|
|
|
|
| 31 |
.idea/
|
| 32 |
.vscode/
|
| 33 |
*.swp
|
| 34 |
*.swo
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
#
|
|
|
|
|
|
|
| 37 |
.DS_Store
|
| 38 |
.AppleDouble
|
| 39 |
.LSOverride
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
*_only_used_entry.bib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
#
|
| 48 |
-
#
|
|
|
|
|
|
|
| 49 |
*.tex
|
| 50 |
*.bib
|
| 51 |
*.pdf
|
|
@@ -57,6 +125,22 @@ env/
|
|
| 57 |
*.synctex.gz
|
| 58 |
*.fls
|
| 59 |
*.fdb_latexmk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
#
|
| 62 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
# Python
|
| 3 |
+
# =============================================================================
|
| 4 |
__pycache__/
|
| 5 |
*.py[cod]
|
| 6 |
*$py.class
|
| 7 |
*.so
|
| 8 |
.Python
|
| 9 |
+
|
| 10 |
+
# Distribution / packaging
|
| 11 |
build/
|
| 12 |
develop-eggs/
|
| 13 |
dist/
|
|
|
|
| 24 |
.installed.cfg
|
| 25 |
*.egg
|
| 26 |
MANIFEST
|
| 27 |
+
pip-log.txt
|
| 28 |
+
pip-delete-this-directory.txt
|
| 29 |
|
| 30 |
+
# =============================================================================
|
| 31 |
+
# Virtual environments / dependency managers
|
| 32 |
+
# =============================================================================
|
| 33 |
venv/
|
| 34 |
env/
|
| 35 |
.env
|
| 36 |
+
.env.*
|
| 37 |
+
!.env.example
|
| 38 |
.venv/
|
| 39 |
+
.python-version
|
| 40 |
+
.tool-versions
|
| 41 |
+
|
| 42 |
+
# =============================================================================
|
| 43 |
+
# Test / type / lint caches
|
| 44 |
+
# =============================================================================
|
| 45 |
+
.pytest_cache/
|
| 46 |
+
.cache/
|
| 47 |
+
.coverage
|
| 48 |
+
.coverage.*
|
| 49 |
+
htmlcov/
|
| 50 |
+
coverage.xml
|
| 51 |
+
.tox/
|
| 52 |
+
.nox/
|
| 53 |
+
.mypy_cache/
|
| 54 |
+
.ruff_cache/
|
| 55 |
+
.pyre/
|
| 56 |
+
.pytype/
|
| 57 |
|
| 58 |
+
# =============================================================================
|
| 59 |
+
# IDEs / editors
|
| 60 |
+
# =============================================================================
|
| 61 |
.idea/
|
| 62 |
.vscode/
|
| 63 |
*.swp
|
| 64 |
*.swo
|
| 65 |
+
*~
|
| 66 |
+
*.iml
|
| 67 |
+
.project
|
| 68 |
+
.pydevproject
|
| 69 |
|
| 70 |
+
# =============================================================================
|
| 71 |
+
# OS noise
|
| 72 |
+
# =============================================================================
|
| 73 |
.DS_Store
|
| 74 |
.AppleDouble
|
| 75 |
.LSOverride
|
| 76 |
+
Thumbs.db
|
| 77 |
+
desktop.ini
|
| 78 |
|
| 79 |
+
# =============================================================================
|
| 80 |
+
# Gradio / Hugging Face Spaces
|
| 81 |
+
# =============================================================================
|
| 82 |
+
.gradio/
|
| 83 |
+
gradio_cached_examples/
|
| 84 |
+
flagged/
|
| 85 |
+
|
| 86 |
+
# =============================================================================
|
| 87 |
+
# BibGuard outputs (generated by main.py / app.py)
|
| 88 |
+
# =============================================================================
|
| 89 |
+
bibguard_output/
|
| 90 |
+
*_only_used.bib
|
| 91 |
*_only_used_entry.bib
|
| 92 |
+
bibliography_report.md
|
| 93 |
+
latex_quality_report.md
|
| 94 |
+
line_by_line_report.md
|
| 95 |
+
report.html
|
| 96 |
+
report.json
|
| 97 |
+
# Local HTTP cache used by src/utils/http.py
|
| 98 |
+
.cache/bibguard/
|
| 99 |
+
**/.cache/bibguard/
|
| 100 |
+
|
| 101 |
+
# =============================================================================
|
| 102 |
+
# User secrets / personal config
|
| 103 |
+
# Recommendation: ship `bibguard.example.yaml` and gitignore the real one
|
| 104 |
+
# so API keys / personal paths don't leak. See README for details.
|
| 105 |
+
# =============================================================================
|
| 106 |
+
# bibguard.yaml
|
| 107 |
+
config.yaml
|
| 108 |
+
.bibguard.yaml
|
| 109 |
+
.bibguard.yml
|
| 110 |
+
secrets.yaml
|
| 111 |
+
*.local.yaml
|
| 112 |
|
| 113 |
+
# =============================================================================
|
| 114 |
+
# User paper data (LaTeX / BibTeX sources and build artifacts)
|
| 115 |
+
# Keep README.md, requirements*.txt, and source-tree .md files.
|
| 116 |
+
# =============================================================================
|
| 117 |
*.tex
|
| 118 |
*.bib
|
| 119 |
*.pdf
|
|
|
|
| 125 |
*.synctex.gz
|
| 126 |
*.fls
|
| 127 |
*.fdb_latexmk
|
| 128 |
+
*.toc
|
| 129 |
+
*.lof
|
| 130 |
+
*.lot
|
| 131 |
+
*.nav
|
| 132 |
+
*.snm
|
| 133 |
+
*.vrb
|
| 134 |
|
| 135 |
+
# Markdown / text files: ignore by default to prevent committing user paper
|
| 136 |
+
# content, but keep documentation and project metadata.
|
| 137 |
+
*.txt
|
| 138 |
+
*.md
|
| 139 |
+
!README.md
|
| 140 |
+
!CHANGELOG.md
|
| 141 |
+
!CONTRIBUTING.md
|
| 142 |
+
!LICENSE.md
|
| 143 |
+
!docs/**/*.md
|
| 144 |
+
!requirements.txt
|
| 145 |
+
!requirements-*.txt
|
| 146 |
+
!**/requirements.txt
|
README.md
CHANGED
|
@@ -11,35 +11,46 @@ pinned: false
|
|
| 11 |
|
| 12 |
# BibGuard: Bibliography & LaTeX Quality Auditor
|
| 13 |
|
| 14 |
-
**BibGuard** is
|
| 15 |
|
| 16 |
-
AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and
|
| 17 |
|
| 18 |
## 🛡 Why BibGuard?
|
| 19 |
|
| 20 |
-
-
|
| 21 |
-
-
|
| 22 |
-
-
|
| 23 |
-
-
|
| 24 |
-
-
|
|
|
|
|
|
|
| 25 |
|
| 26 |
## 🚀 Features
|
| 27 |
|
| 28 |
### Bibliography Validation
|
| 29 |
-
-
|
| 30 |
-
-
|
| 31 |
-
-
|
| 32 |
-
-
|
| 33 |
-
-
|
|
|
|
|
|
|
| 34 |
|
| 35 |
### LaTeX Quality Checks
|
| 36 |
-
-
|
| 37 |
-
-
|
| 38 |
-
-
|
| 39 |
-
-
|
| 40 |
-
-
|
| 41 |
-
-
|
| 42 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
## 📦 Installation
|
| 45 |
|
|
@@ -57,10 +68,9 @@ pip install -r requirements.txt
|
|
| 57 |
python main.py --init
|
| 58 |
```
|
| 59 |
|
| 60 |
-
This creates `config.yaml`. Edit it to
|
| 61 |
|
| 62 |
-
####
|
| 63 |
-
Best for individual papers.
|
| 64 |
```yaml
|
| 65 |
files:
|
| 66 |
bib: "paper.bib"
|
|
@@ -68,141 +78,186 @@ files:
|
|
| 68 |
output_dir: "bibguard_output"
|
| 69 |
```
|
| 70 |
|
| 71 |
-
####
|
| 72 |
-
|
| 73 |
```yaml
|
| 74 |
files:
|
| 75 |
input_dir: "./my_project_dir"
|
| 76 |
output_dir: "bibguard_output"
|
| 77 |
```
|
| 78 |
|
| 79 |
-
### 2. Run
|
| 80 |
|
| 81 |
```bash
|
| 82 |
-
python main.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
```
|
| 84 |
|
| 85 |
-
**
|
| 86 |
-
- `
|
| 87 |
-
- `
|
| 88 |
-
- `
|
| 89 |
-
- `
|
|
|
|
| 90 |
|
| 91 |
## 🛠 Configuration
|
| 92 |
|
| 93 |
-
|
| 94 |
|
| 95 |
```yaml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
bibliography:
|
| 97 |
-
check_metadata: true #
|
| 98 |
-
check_usage: true #
|
| 99 |
-
check_duplicates: true
|
| 100 |
-
check_preprint_ratio: true #
|
| 101 |
check_relevance: false # LLM-based relevance check (requires API key)
|
| 102 |
|
| 103 |
-
|
| 104 |
-
#
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
```
|
| 121 |
|
| 122 |
-
## 🤖 LLM-Based Relevance
|
| 123 |
|
| 124 |
-
|
| 125 |
|
| 126 |
-
``
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
```
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
- **
|
| 138 |
-
- **
|
| 139 |
-
- **
|
| 140 |
-
|
| 141 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
-
Then run:
|
| 144 |
```bash
|
| 145 |
-
|
|
|
|
| 146 |
```
|
| 147 |
|
|
|
|
|
|
|
| 148 |
## 📝 Understanding Reports
|
| 149 |
|
| 150 |
-
###
|
| 151 |
-
|
| 152 |
-
-
|
| 153 |
-
-
|
| 154 |
-
-
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
###
|
| 157 |
-
|
| 158 |
-
-
|
| 159 |
-
-
|
| 160 |
-
- 🔵 **Suggestions**: Style improvements (e.g., weak sentence starters)
|
| 161 |
|
| 162 |
-
###
|
| 163 |
-
|
| 164 |
|
| 165 |
## 🧐 Understanding Mismatches
|
| 166 |
|
| 167 |
BibGuard is strict, but false positives happen:
|
| 168 |
|
| 169 |
-
1.
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
2. **Author List Variations**:
|
| 174 |
-
- *Reason*: Different databases handle large author lists differently
|
| 175 |
-
- *Action*: Check if primary authors match
|
| 176 |
-
|
| 177 |
-
3. **Venue Name Differences**:
|
| 178 |
-
- *Reason*: Abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems")
|
| 179 |
-
- *Action*: Both are usually correct
|
| 180 |
|
| 181 |
-
|
| 182 |
-
- *Reason*: Blogs, documentation not indexed by academic databases
|
| 183 |
-
- *Action*: Manually verify URL and title
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
``
|
| 188 |
-
|
| 189 |
-
python main.py --list-templates # List conference templates
|
| 190 |
-
python main.py --config my.yaml # Use custom config file
|
| 191 |
-
```
|
| 192 |
|
| 193 |
## 🤝 Contributing
|
| 194 |
|
| 195 |
-
Contributions welcome
|
| 196 |
|
| 197 |
## 🙏 Acknowledgments
|
| 198 |
|
| 199 |
-
BibGuard uses
|
| 200 |
-
- arXiv API
|
| 201 |
-
- CrossRef API
|
| 202 |
-
- Semantic Scholar API
|
| 203 |
-
- DBLP API
|
| 204 |
-
- OpenAlex API
|
| 205 |
-
- Google Scholar (via
|
| 206 |
|
| 207 |
---
|
| 208 |
|
|
|
|
| 11 |
|
| 12 |
# BibGuard: Bibliography & LaTeX Quality Auditor
|
| 13 |
|
| 14 |
+
**BibGuard** is a comprehensive quality-assurance tool for academic papers. It validates every bibliography entry against real-world databases, checks LaTeX submission quality, flags retracted DOIs and broken URLs, and uses an LLM (optional) to verify that cited papers actually support your claims.
|
| 15 |
|
| 16 |
+
AI coding assistants and writing tools often hallucinate plausible-sounding but non-existent references. **BibGuard** verifies the existence of every entry against multiple databases (arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, Google Scholar) and produces a single, beautiful, self-contained HTML report you can open offline.
|
| 17 |
|
| 18 |
## 🛡 Why BibGuard?
|
| 19 |
|
| 20 |
+
- **🚫 Stop Hallucinations**: Instantly flag citations that don't exist or have mismatched metadata
|
| 21 |
+
- **🚫 Catch Retractions**: Detect references to papers that have been retracted or are under "expression of concern"
|
| 22 |
+
- **🔗 Detect Broken URLs**: HEAD-check `entry.url` to find dead links before reviewers do
|
| 23 |
+
- **📋 LaTeX Quality Checks**: Detect formatting issues, weak writing patterns, double-blind compliance, AI-text artifacts
|
| 24 |
+
- **🔒 Safe & Non-Destructive**: Your original files are **never modified** — only reports are generated
|
| 25 |
+
- **🧠 Contextual Relevance** *(optional, with LLM)*: Score each citation 1-5 and tag its role (baseline/method/dataset/counterexample/survey/motivation/other)
|
| 26 |
+
- **⚡ Re-runs are fast**: SQLite-backed HTTP cache + auto-retry mean the second run on the same paper completes in seconds
|
| 27 |
|
| 28 |
## 🚀 Features
|
| 29 |
|
| 30 |
### Bibliography Validation
|
| 31 |
+
- **🔍 Multi-Source Verification**: Validates metadata against arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar
|
| 32 |
+
- **🚫 Retraction Detection**: Flags retracted/withdrawn DOIs via CrossRef's `update-to` relation
|
| 33 |
+
- **🔗 URL Liveness Check**: Optional HEAD-then-GET check on every `entry.url`
|
| 34 |
+
- **📊 Preprint Detection**: Warns if >50% of references are preprints, and suggests published versions when arXiv records them
|
| 35 |
+
- **👀 Usage Analysis**: Highlights missing citations and unused bib entries
|
| 36 |
+
- **👯 Duplicate Detection**: Identifies duplicate entries with fuzzy matching
|
| 37 |
+
- **🤖 AI Relevance + Role Tagging** *(optional)*: 1-5 relevance score plus citation role classification
|
| 38 |
|
| 39 |
### LaTeX Quality Checks
|
| 40 |
+
- **📐 Format Validation**: Caption placement, cross-references, citation spacing, equation punctuation
|
| 41 |
+
- **✍️ Writing Quality**: Weak sentence starters, hedging language, redundant phrases
|
| 42 |
+
- **🔤 Consistency**: Spelling variants (US/UK English), hyphenation, terminology — augmentable via project glossary
|
| 43 |
+
- **🤖 AI Artifact Detection**: Conversational AI responses, placeholder text, Markdown remnants
|
| 44 |
+
- **🔠 Acronym Validation**: Ensures acronyms are defined before use, with a project-glossary skip list
|
| 45 |
+
- **🎭 Anonymization**: Checks for identity leaks in double-blind submissions
|
| 46 |
+
- **📅 Citation Age**: Flags references older than 30 years
|
| 47 |
+
- **🎓 Conference Templates**: Mandatory-section and style-package checks for ACL, EMNLP, NAACL, CVPR, ICCV, ECCV, NeurIPS, ICML, ICLR
|
| 48 |
+
|
| 49 |
+
### Outputs
|
| 50 |
+
- 📄 **Markdown reports** — bibliography validation + LaTeX quality issues
|
| 51 |
+
- 🌐 **Self-contained HTML** — dark mode, full-text search, per-section severity filters, inline highlighting of the offending span on each LaTeX issue. Opens offline, no server required
|
| 52 |
+
- 🤖 **JSON** for CI / scripts / custom dashboards
|
| 53 |
+
- 🧹 **Cleaned `.bib`** containing only entries actually cited in the paper
|
| 54 |
|
| 55 |
## 📦 Installation
|
| 56 |
|
|
|
|
| 68 |
python main.py --init
|
| 69 |
```
|
| 70 |
|
| 71 |
+
This creates `config.yaml`. Edit it to point at your `.bib` and `.tex` files.
|
| 72 |
|
| 73 |
+
#### Single File Mode
|
|
|
|
| 74 |
```yaml
|
| 75 |
files:
|
| 76 |
bib: "paper.bib"
|
|
|
|
| 78 |
output_dir: "bibguard_output"
|
| 79 |
```
|
| 80 |
|
| 81 |
+
#### Directory Scan Mode
|
| 82 |
+
For projects with multiple `.tex` and `.bib` files:
|
| 83 |
```yaml
|
| 84 |
files:
|
| 85 |
input_dir: "./my_project_dir"
|
| 86 |
output_dir: "bibguard_output"
|
| 87 |
```
|
| 88 |
|
| 89 |
+
### 2. Run a Check
|
| 90 |
|
| 91 |
```bash
|
| 92 |
+
python main.py # full check using config.yaml / bibguard.yaml
|
| 93 |
+
python main.py --quick # local-only checks (no network, instant)
|
| 94 |
+
python main.py --format json,html # pick output formats
|
| 95 |
+
python main.py --verbose # DEBUG logs to stderr
|
| 96 |
+
python main.py --config my.yaml # custom config path
|
| 97 |
+
python main.py --list-templates # list conference templates
|
| 98 |
```
|
| 99 |
|
| 100 |
+
**Default outputs** (in `bibguard_output/`):
|
| 101 |
+
- `report.html` — single self-contained HTML, opens offline, dark-mode aware
|
| 102 |
+
- `report.json` — full machine-readable dump (only when `json` is in `output.formats`)
|
| 103 |
+
- `bibliography_report.md` — bibliography validation, with corroboration notes
|
| 104 |
+
- `latex_quality_report.md` — LaTeX quality issues, errors / warnings / suggestions, full line content with the offending span bolded
|
| 105 |
+
- `<bibname>_only_used.bib` — clean bibliography of cited entries only
|
| 106 |
|
| 107 |
## 🛠 Configuration
|
| 108 |
|
| 109 |
+
`bibguard.yaml` (or `config.yaml`) contains the following sections:
|
| 110 |
|
| 111 |
```yaml
|
| 112 |
+
files:
|
| 113 |
+
bib: "paper.bib"
|
| 114 |
+
tex: "paper.tex"
|
| 115 |
+
output_dir: "bibguard_output"
|
| 116 |
+
|
| 117 |
+
network:
|
| 118 |
+
contact_email: "" # used in polite-pool User-Agent for arXiv/CrossRef/OpenAlex
|
| 119 |
+
cache_enabled: true # local SQLite cache for HTTP responses (~/.cache/bibguard)
|
| 120 |
+
cache_ttl_hours: 24
|
| 121 |
+
retry_total: 5 # auto-retry on 429/5xx with exponential backoff
|
| 122 |
+
retry_backoff_factor: 1.5
|
| 123 |
+
|
| 124 |
+
template: "" # acl | emnlp | naacl | cvpr | iccv | eccv | neurips | icml | iclr
|
| 125 |
+
|
| 126 |
bibliography:
|
| 127 |
+
check_metadata: true # verify against online databases (slow on first run, fast on repeats)
|
| 128 |
+
check_usage: true # find unused entries / missing citations
|
| 129 |
+
check_duplicates: true
|
| 130 |
+
check_preprint_ratio: true # warn if >50% of references are preprints
|
| 131 |
check_relevance: false # LLM-based relevance check (requires API key)
|
| 132 |
|
| 133 |
+
submission_extra:
|
| 134 |
+
url_liveness: false # HEAD-check every entry.url field (slow)
|
| 135 |
+
retraction: true # flag retracted DOIs via CrossRef
|
| 136 |
+
|
| 137 |
+
submission: # 11 LaTeX checkers — toggle each independently
|
| 138 |
+
caption: true
|
| 139 |
+
reference: true
|
| 140 |
+
formatting: true
|
| 141 |
+
equation: true
|
| 142 |
+
ai_artifacts: true
|
| 143 |
+
sentence: true
|
| 144 |
+
consistency: true
|
| 145 |
+
acronym: true
|
| 146 |
+
number: true
|
| 147 |
+
citation_quality: true
|
| 148 |
+
anonymization: true
|
| 149 |
+
|
| 150 |
+
# Project glossary feeds the consistency / acronym checkers.
|
| 151 |
+
glossary:
|
| 152 |
+
preferred:
|
| 153 |
+
- "Transformer"
|
| 154 |
+
- "fine-tuning"
|
| 155 |
+
acronyms:
|
| 156 |
+
NLP: "Natural Language Processing"
|
| 157 |
+
LLM: "Large Language Model"
|
| 158 |
+
|
| 159 |
+
llm:
|
| 160 |
+
backend: "gemini" # gemini | openai | anthropic | deepseek | ollama | vllm
|
| 161 |
+
model: "" # leave empty for sensible default per backend
|
| 162 |
+
api_key: "" # PREFER env var: $GEMINI_API_KEY / $OPENAI_API_KEY / etc.
|
| 163 |
+
|
| 164 |
+
output:
|
| 165 |
+
quiet: false
|
| 166 |
+
minimal_verified: false
|
| 167 |
+
formats: [markdown, html] # any of: markdown, html, json
|
| 168 |
```
|
| 169 |
|
| 170 |
+
## 🤖 LLM-Based Relevance + Role Tagging
|
| 171 |
|
| 172 |
+
When `bibliography.check_relevance` is `true`, BibGuard sends each citation's surrounding context plus the cited paper's abstract to your chosen LLM. The model returns a 1-5 relevance score, an `is_relevant` boolean, a one-sentence explanation, and a **citation role**:
|
| 173 |
|
| 174 |
+
- `baseline` — cited as a comparison/baseline
|
| 175 |
+
- `method` — cited paper introduces a method this one builds on
|
| 176 |
+
- `dataset` — provides a dataset/benchmark used here
|
| 177 |
+
- `counterexample` — cited to argue against
|
| 178 |
+
- `survey` — cited as a survey/overview
|
| 179 |
+
- `motivation` — cited to motivate the problem
|
| 180 |
+
- `other`
|
| 181 |
|
| 182 |
+
**Supported backends**: Gemini, OpenAI, Anthropic, DeepSeek, Ollama (local), vLLM (custom endpoint).
|
| 183 |
+
|
| 184 |
+
**API keys**: read from environment variables by convention — `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `DEEPSEEK_API_KEY`. Set them in your shell rather than committing `api_key:` to `bibguard.yaml`.
|
| 185 |
+
|
| 186 |
+
## 🌐 Web UI
|
| 187 |
+
|
| 188 |
+
```bash
|
| 189 |
+
python app.py
|
| 190 |
```
|
| 191 |
|
| 192 |
+
Opens at `http://localhost:7860`. The web UI mirrors the CLI but with a streaming status panel and three presets:
|
| 193 |
+
|
| 194 |
+
- **Quick** — local checks only, no network, instant
|
| 195 |
+
- **Standard** — local + retraction lookup (CrossRef)
|
| 196 |
+
- **Strict** — adds multi-source metadata fetch + URL liveness (slow on first run; subsequent runs are cached)
|
| 197 |
+
|
| 198 |
+
The toolbar fits in one row: file uploads, preset chips, and Run / Stop. Per-check overrides live in the **Advanced** accordion. The report renders inline as a self-contained iframe so the page stays stable while entries stream in. Downloads (HTML, Markdown bib, JSON, cleaned `.bib`, `bibguard.log`) appear in the **Downloads** accordion below.
|
| 199 |
+
|
| 200 |
+
Set `BIBGUARD_CONTACT_EMAIL=you@example.com` in your shell to use a real contact in the polite-pool User-Agent.
|
| 201 |
+
|
| 202 |
+
## 🪝 Pre-commit Hook
|
| 203 |
+
|
| 204 |
+
To run BibGuard automatically before each commit that touches `.tex` or `.bib`:
|
| 205 |
|
|
|
|
| 206 |
```bash
|
| 207 |
+
cd /path/to/your-paper-repo
|
| 208 |
+
bash /path/to/BibGuard/scripts/install-hook.sh
|
| 209 |
```
|
| 210 |
|
| 211 |
+
Skip the hook for one commit with `git commit --no-verify`.
|
| 212 |
+
|
| 213 |
## 📝 Understanding Reports
|
| 214 |
|
| 215 |
+
### Self-Contained HTML (`report.html`)
|
| 216 |
+
The recommended output. Single file, no external assets, dark-mode aware. Includes:
|
| 217 |
+
- Three tabs: **Bibliography** · **LaTeX Quality** · **Retractions / URLs**
|
| 218 |
+
- **Per-section filter chips** — bibliography filters by Verified / Unverified / Unused; LaTeX quality filters by Errors / Warnings / Info
|
| 219 |
+
- **Full-text search** across titles, authors, keys, and messages — works inside the active tab
|
| 220 |
+
- **Inline span highlighting** — for LaTeX issues that come from a regex (e.g., `\cite{}` without `~`), the offending substring is wrapped in `<mark>` so you can see exactly *where* in the line to look
|
| 221 |
+
- **Honest empty states** — Retractions / URL liveness panels report how many entries actually carried a `doi=` / `url=` field, so an empty result no longer looks like the check failed silently
|
| 222 |
+
- Theme toggle that overrides system preference
|
| 223 |
|
| 224 |
+
### Markdown Reports
|
| 225 |
+
Two files for granular review and code review tooling:
|
| 226 |
+
- `bibliography_report.md` — every entry with metadata-match status, including positive **corroboration notes** when a second source agreed
|
| 227 |
+
- `latex_quality_report.md` — issues grouped by checker and severity, full line content with the offending span bolded
|
|
|
|
| 228 |
|
| 229 |
+
### JSON Output
|
| 230 |
+
Machine-readable dump for CI integration. Top-level keys: `meta`, `summary`, `entries`, `submission_results`, `retractions`, `url_findings`, `duplicates`, `missing_citations`.
|
| 231 |
|
| 232 |
## 🧐 Understanding Mismatches
|
| 233 |
|
| 234 |
BibGuard is strict, but false positives happen:
|
| 235 |
|
| 236 |
+
1. **Year Discrepancy (±1 Year)** — preprint vs. official publication. Verify which version you intend to cite.
|
| 237 |
+
2. **Author List Variations** — different databases truncate large author lists differently. Check primary authors.
|
| 238 |
+
3. **Venue Name Differences** — abbreviations vs. full names (e.g., "NeurIPS" vs. "Neural Information Processing Systems"). Both usually correct.
|
| 239 |
+
4. **Non-Academic Sources** — blogs and documentation aren't indexed by academic databases. Verify URL and title manually.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
+
## 🔧 Performance Notes
|
|
|
|
|
|
|
| 242 |
|
| 243 |
+
- **First run** with `check_metadata: true` on ~100 entries: 1-3 minutes (rate-limited by arXiv/CrossRef).
|
| 244 |
+
- **Re-runs**: seconds, thanks to the SQLite HTTP cache at `~/.cache/bibguard/http_cache.sqlite` (TTL 24h by default).
|
| 245 |
+
- **Quick mode** (`python main.py --quick`) bypasses all network calls; runs in <1 second on most papers.
|
| 246 |
+
- **Retraction lookup** is concurrent; ~5-10 seconds for 100 entries with cache cold.
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
## 🤝 Contributing
|
| 249 |
|
| 250 |
+
Contributions welcome. Open an issue or pull request.
|
| 251 |
|
| 252 |
## 🙏 Acknowledgments
|
| 253 |
|
| 254 |
+
BibGuard uses the following data sources:
|
| 255 |
+
- [arXiv API](https://info.arxiv.org/help/api/index.html)
|
| 256 |
+
- [CrossRef REST API](https://api.crossref.org)
|
| 257 |
+
- [Semantic Scholar Graph API](https://api.semanticscholar.org)
|
| 258 |
+
- [DBLP API](https://dblp.org/faq/How+to+use+the+dblp+search+API.html)
|
| 259 |
+
- [OpenAlex API](https://docs.openalex.org)
|
| 260 |
+
- Google Scholar (via scraping; rate-limited)
|
| 261 |
|
| 262 |
---
|
| 263 |
|
app.py
CHANGED
|
@@ -1,927 +1,1243 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
BibGuard Gradio
|
| 4 |
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
import tempfile
|
| 9 |
-
import
|
| 10 |
from pathlib import Path
|
| 11 |
-
|
| 12 |
-
import
|
| 13 |
|
| 14 |
from src.parsers import BibParser, TexParser
|
| 15 |
-
from src.fetchers import
|
|
|
|
|
|
|
|
|
|
| 16 |
from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
|
| 17 |
from src.report.generator import ReportGenerator, EntryReport
|
| 18 |
-
from src.config.yaml_config import
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
from src.checkers import CHECKER_REGISTRY
|
| 21 |
-
from src.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
from app_helper import fetch_and_compare_with_workflow
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
# Custom CSS for better Markdown rendering
|
| 26 |
CUSTOM_CSS = """
|
| 27 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
|
| 28 |
|
| 29 |
-
* {
|
| 30 |
-
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
|
| 31 |
-
}
|
| 32 |
-
"""
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
Ensure your academic paper is flawless. Upload your <code>.bib</code> and <code>.tex</code> files on the left and click <strong>"Check Now"</strong>.
|
| 43 |
-
</p>
|
| 44 |
-
|
| 45 |
-
<div style="display: grid; gap: 20px; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));">
|
| 46 |
-
<div style="background: #fefce8; padding: 16px; border-radius: 8px; border: 1px solid #fde047;">
|
| 47 |
-
<strong style="color: #854d0e; display: block; margin-bottom: 8px;">⚠️ Metadata Check Defaults</strong>
|
| 48 |
-
"🔍 Metadata" is <strong>disabled by default</strong>. It verifies your entries against ArXiv/DBLP/Crossref but takes time (1-3 mins) to fetch data. Enable it if you want strict verification.
|
| 49 |
-
</div>
|
| 50 |
-
|
| 51 |
-
<div style="background: #eff6ff; padding: 16px; border-radius: 8px; border: 1px solid #bfdbfe;">
|
| 52 |
-
<strong style="color: #1e40af; display: block; margin-bottom: 8px;">🚀 Go Pro with Local Version</strong>
|
| 53 |
-
LLM-based context relevance checking (is this citation actually relevant?) is excluded here. Clone the <a href="https://github.com/thinkwee/BibGuard" target="_blank" style="color: #2563eb; text-decoration: underline; font-weight: 600;">GitHub repo</a> to use the full power with your API key.
|
| 54 |
-
</div>
|
| 55 |
-
</div>
|
| 56 |
-
|
| 57 |
-
<h4 style="margin: 24px 0 12px 0; color: #111827; font-size: 1.1em;">📊 Understanding Your Reports</h4>
|
| 58 |
-
<div style="display: grid; gap: 12px;">
|
| 59 |
-
<div style="display: flex; gap: 12px; align-items: baseline;">
|
| 60 |
-
<span style="background: #e0e7ff; color: #3730a3; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📚 Bibliography</span>
|
| 61 |
-
<span>Validates metadata fields, detects duplicates, and checks citation counts.</span>
|
| 62 |
-
</div>
|
| 63 |
-
<div style="display: flex; gap: 12px; align-items: baseline;">
|
| 64 |
-
<span style="background: #dcfce7; color: #166534; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📝 LaTeX Quality</span>
|
| 65 |
-
<span>Syntax check, caption validation, acronym consistency, and style suggestions.</span>
|
| 66 |
-
</div>
|
| 67 |
-
<div style="display: flex; gap: 12px; align-items: baseline;">
|
| 68 |
-
<span style="background: #f3f4f6; color: #4b5563; padding: 2px 8px; border-radius: 4px; font-size: 0.9em; font-weight: 600; white-space: nowrap;">📋 Line-by-Line</span>
|
| 69 |
-
<span>Maps every issue found directly to the line number in your source file.</span>
|
| 70 |
-
</div>
|
| 71 |
-
</div>
|
| 72 |
-
</div>
|
| 73 |
-
</div>
|
| 74 |
-
</div>
|
| 75 |
-
"""
|
| 76 |
-
|
| 77 |
-
CUSTOM_CSS += """
|
| 78 |
-
/* Global Reset */
|
| 79 |
-
body, gradio-app {
|
| 80 |
-
overflow: hidden !important; /* Prevent double scrollbars on the page */
|
| 81 |
-
}
|
| 82 |
|
| 83 |
.gradio-container {
|
| 84 |
-
max-width:
|
|
|
|
|
|
|
|
|
|
| 85 |
width: 100% !important;
|
| 86 |
-
|
| 87 |
-
padding: 0 !important;
|
| 88 |
-
margin: 0 !important;
|
| 89 |
}
|
| 90 |
|
| 91 |
-
/* Header
|
| 92 |
-
.
|
| 93 |
-
padding:
|
| 94 |
-
background: white;
|
| 95 |
border-bottom: 1px solid #e5e7eb;
|
|
|
|
| 96 |
}
|
| 97 |
|
| 98 |
-
/*
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
| 105 |
}
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
}
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
| 126 |
}
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
border-radius: 12px;
|
| 132 |
-
padding: 24px;
|
| 133 |
-
margin-bottom: 16px; /* Spacing between cards */
|
| 134 |
-
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
|
| 135 |
-
border: 1px solid #e5e7eb;
|
| 136 |
-
transition: transform 0.2s, box-shadow 0.2s;
|
| 137 |
}
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
}
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
| 152 |
}
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
color: #111827;
|
| 158 |
-
margin: 0 0 4px 0;
|
| 159 |
}
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
}
|
| 166 |
-
|
| 167 |
-
.
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
}
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
font-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
}
|
| 182 |
-
|
| 183 |
-
.
|
| 184 |
-
.
|
| 185 |
-
.
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
/* Stats Grid */
|
| 190 |
-
.stats-container {
|
| 191 |
-
display: grid;
|
| 192 |
-
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
| 193 |
-
gap: 16px;
|
| 194 |
-
margin-bottom: 24px;
|
| 195 |
}
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
color:
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
}
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
}
|
| 216 |
|
| 217 |
-
.
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
display: flex;
|
| 229 |
-
flex-direction: column;
|
| 230 |
-
|
| 231 |
-
/* Height constraint to prevent one huge card from stretching the row */
|
| 232 |
-
max-height: 100px;
|
| 233 |
-
overflow-y: auto;
|
| 234 |
}
|
| 235 |
|
| 236 |
-
/*
|
| 237 |
-
.
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
}
|
| 240 |
-
.
|
| 241 |
-
|
| 242 |
-
|
|
|
|
| 243 |
}
|
| 244 |
-
|
| 245 |
-
.
|
| 246 |
-
font-size:
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
margin
|
| 251 |
-
|
| 252 |
-
top: 0;
|
| 253 |
-
background: #f9fafb; /* Maintain bg on scroll */
|
| 254 |
-
z-index: 1;
|
| 255 |
}
|
|
|
|
| 256 |
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
transition: all 0.2s;
|
| 266 |
}
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
}
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
display: flex;
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
}
|
| 281 |
-
|
| 282 |
-
.card-title {
|
| 283 |
-
font-size: 1.1em;
|
| 284 |
font-weight: 600;
|
| 285 |
-
color: #
|
| 286 |
-
margin: 0;
|
| 287 |
-
}
|
| 288 |
-
|
| 289 |
-
.card-subtitle {
|
| 290 |
-
font-size: 0.9em;
|
| 291 |
-
color: #6b7280;
|
| 292 |
-
margin-top: 4px;
|
| 293 |
-
}
|
| 294 |
-
|
| 295 |
-
/* Status Badges */
|
| 296 |
-
.badge {
|
| 297 |
display: inline-flex;
|
| 298 |
align-items: center;
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
font-weight: 500;
|
| 303 |
}
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
/* Content Styling */
|
| 312 |
-
.card-content {
|
| 313 |
-
font-size: 15px;
|
| 314 |
-
color: #374151;
|
| 315 |
-
line-height: 1.6;
|
| 316 |
}
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
padding: 2px 6px;
|
| 321 |
border-radius: 4px;
|
| 322 |
-
font-
|
| 323 |
-
|
| 324 |
-
color: #c2410c;
|
| 325 |
}
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
display:
|
| 330 |
-
|
| 331 |
gap: 12px;
|
| 332 |
-
|
|
|
|
| 333 |
}
|
| 334 |
-
|
| 335 |
-
.
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
}
|
|
|
|
| 340 |
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
}
|
| 347 |
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
}
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
.
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
}
|
| 360 |
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
color: white;
|
| 364 |
-
padding: 20px;
|
| 365 |
-
border-radius: 12px;
|
| 366 |
text-align: center;
|
| 367 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
}
|
| 369 |
-
|
| 370 |
-
.
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
.
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
}
|
|
|
|
| 380 |
|
| 381 |
-
/* Button styling */
|
| 382 |
-
.primary-btn {
|
| 383 |
-
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
| 384 |
-
border: none !important;
|
| 385 |
-
font-weight: 600 !important;
|
| 386 |
-
}
|
| 387 |
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
"""
|
| 394 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
|
| 396 |
def create_config_from_ui(
|
| 397 |
-
check_metadata
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
check_preprint_ratio: bool,
|
| 401 |
-
caption: bool,
|
| 402 |
-
reference: bool,
|
| 403 |
-
formatting: bool,
|
| 404 |
-
equation: bool,
|
| 405 |
-
ai_artifacts: bool,
|
| 406 |
-
sentence: bool,
|
| 407 |
-
consistency: bool,
|
| 408 |
-
acronym: bool,
|
| 409 |
-
number: bool,
|
| 410 |
-
citation_quality: bool,
|
| 411 |
-
anonymization: bool
|
| 412 |
) -> BibGuardConfig:
|
| 413 |
-
"""Create a BibGuardConfig from UI settings."""
|
| 414 |
config = BibGuardConfig()
|
| 415 |
-
|
| 416 |
config.bibliography = BibliographyConfig(
|
| 417 |
check_metadata=check_metadata,
|
| 418 |
check_usage=check_usage,
|
| 419 |
check_duplicates=check_duplicates,
|
| 420 |
check_preprint_ratio=check_preprint_ratio,
|
| 421 |
-
check_relevance=False #
|
| 422 |
)
|
| 423 |
-
|
| 424 |
config.submission = SubmissionConfig(
|
| 425 |
-
caption=caption,
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
ai_artifacts=ai_artifacts,
|
| 430 |
-
sentence=sentence,
|
| 431 |
-
consistency=consistency,
|
| 432 |
-
acronym=acronym,
|
| 433 |
-
number=number,
|
| 434 |
-
citation_quality=citation_quality,
|
| 435 |
-
anonymization=anonymization
|
| 436 |
)
|
| 437 |
-
|
| 438 |
config.output = OutputConfig(quiet=True, minimal_verified=False)
|
| 439 |
-
|
| 440 |
return config
|
| 441 |
|
| 442 |
|
| 443 |
-
def
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
html.append(f'<div class="stat-card"><div class="stat-value">{total}</div><div class="stat-label">Total Entries</div></div>')
|
| 454 |
-
html.append(f'<div class="stat-card"><div class="stat-value">{verified}</div><div class="stat-label">Verified</div></div>')
|
| 455 |
-
html.append(f'<div class="stat-card"><div class="stat-value">{used}</div><div class="stat-label">Used in Text</div></div>')
|
| 456 |
-
html.append('</div>')
|
| 457 |
-
|
| 458 |
-
# 2. Entries
|
| 459 |
-
for report in report_gen.entries:
|
| 460 |
-
entry = report.entry
|
| 461 |
-
status_badges = []
|
| 462 |
-
|
| 463 |
-
# Metadata Status
|
| 464 |
-
if report.comparison:
|
| 465 |
-
if report.comparison.is_match:
|
| 466 |
-
status_badges.append('<span class="badge badge-success">✓ Verified</span>')
|
| 467 |
-
if report.comparison.source:
|
| 468 |
-
status_badges.append(f'<span class="badge badge-info">{report.comparison.source.upper()}</span>')
|
| 469 |
-
else:
|
| 470 |
-
status_badges.append('<span class="badge badge-error">⚠ Metadata Mismatch</span>')
|
| 471 |
-
else:
|
| 472 |
-
status_badges.append('<span class="badge badge-neutral">No Metadata Check</span>')
|
| 473 |
-
|
| 474 |
-
# Usage Status
|
| 475 |
-
if report.usage:
|
| 476 |
-
if report.usage.is_used:
|
| 477 |
-
status_badges.append(f'<span class="badge badge-success">Used: {report.usage.usage_count}x</span>')
|
| 478 |
-
else:
|
| 479 |
-
status_badges.append('<span class="badge badge-warning">Unused</span>')
|
| 480 |
-
|
| 481 |
-
# Build Card
|
| 482 |
-
html.append(f'''
|
| 483 |
-
<div class="report-card">
|
| 484 |
-
<div class="card-header">
|
| 485 |
-
<div>
|
| 486 |
-
<h3 class="card-title">{entry.title or "No Title"}</h3>
|
| 487 |
-
<div class="card-subtitle">{entry.key} • {entry.year} • {entry.entry_type}</div>
|
| 488 |
-
</div>
|
| 489 |
-
<div style="display: flex; gap: 8px;">
|
| 490 |
-
{" ".join(status_badges)}
|
| 491 |
-
</div>
|
| 492 |
-
</div>
|
| 493 |
-
|
| 494 |
-
<div class="card-content">
|
| 495 |
-
<div class="detail-grid">
|
| 496 |
-
{
|
| 497 |
-
(lambda e: "".join([
|
| 498 |
-
f'<div class="detail-item"><div class="detail-label">{k}</div><div class="detail-value">{v}</div></div>'
|
| 499 |
-
for k, v in filter(None, [
|
| 500 |
-
("Authors", e.author or "N/A"),
|
| 501 |
-
("Venue", e.journal or e.booktitle or e.publisher or "N/A"),
|
| 502 |
-
("DOI", e.doi) if e.doi else None,
|
| 503 |
-
("ArXiv", e.arxiv_id) if e.arxiv_id and not e.doi else None,
|
| 504 |
-
("Volume/Pages", f"{'Vol.'+e.volume if e.volume else ''} {'pp.'+e.pages if e.pages else ''}".strip()) if e.volume or e.pages else None,
|
| 505 |
-
("URL", f'<a href="{e.url}" target="_blank" style="text-decoration:underline;">Link</a>') if e.url else None
|
| 506 |
-
])
|
| 507 |
-
]))(entry)
|
| 508 |
-
}
|
| 509 |
-
</div>
|
| 510 |
-
''')
|
| 511 |
-
|
| 512 |
-
# Add issues if any
|
| 513 |
-
issues = []
|
| 514 |
-
if report.comparison and not report.comparison.is_match:
|
| 515 |
-
# Add main message derived from match status
|
| 516 |
-
if report.comparison.issues:
|
| 517 |
-
for issue in report.comparison.issues:
|
| 518 |
-
issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• {issue}</div>')
|
| 519 |
-
else:
|
| 520 |
-
issues.append(f'<div style="margin-left: 20px; font-size: 0.9em; color: #b91c1c;">• Verification failed</div>')
|
| 521 |
-
|
| 522 |
-
if issues:
|
| 523 |
-
html.append('<div style="margin-top: 16px; padding-top: 12px; border-top: 1px solid #eee;">')
|
| 524 |
-
html.append("".join(issues))
|
| 525 |
-
html.append('</div>')
|
| 526 |
-
|
| 527 |
-
html.append('</div></div>') # Close card-content and report-card
|
| 528 |
-
|
| 529 |
-
html.append('</div>') # Close container
|
| 530 |
-
return "".join(html)
|
| 531 |
-
|
| 532 |
-
def generate_latex_html(results: list) -> str:
|
| 533 |
-
"""Generate HTML for LaTeX quality check."""
|
| 534 |
-
from src.checkers import CheckSeverity
|
| 535 |
-
|
| 536 |
-
html = ['<div class="scrollable-report-area">']
|
| 537 |
-
|
| 538 |
-
# Stats
|
| 539 |
-
errors = sum(1 for r in results if r.severity == CheckSeverity.ERROR)
|
| 540 |
-
warnings = sum(1 for r in results if r.severity == CheckSeverity.WARNING)
|
| 541 |
-
infos = sum(1 for r in results if r.severity == CheckSeverity.INFO)
|
| 542 |
-
|
| 543 |
-
html.append('<div class="stats-container">')
|
| 544 |
-
html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #ef4444 0%, #b91c1c 100%);"><div class="stat-value">{errors}</div><div class="stat-label">Errors</div></div>')
|
| 545 |
-
html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%);"><div class="stat-value">{warnings}</div><div class="stat-label">Warnings</div></div>')
|
| 546 |
-
html.append(f'<div class="stat-card" style="background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%);"><div class="stat-value">{infos}</div><div class="stat-label">Suggestions</div></div>')
|
| 547 |
-
html.append('</div>')
|
| 548 |
-
|
| 549 |
-
if not results:
|
| 550 |
-
html.append('<div class="report-card"><div class="card-content" style="text-align: center; padding: 40px; color: #166534; font-size: 1.2em;">✅ No issues found in LaTeX code!</div></div>')
|
| 551 |
-
else:
|
| 552 |
-
# Group by Checker
|
| 553 |
-
results.sort(key=lambda x: x.checker_name)
|
| 554 |
-
current_checker = None
|
| 555 |
-
|
| 556 |
-
for result in results:
|
| 557 |
-
badge_class = "badge-neutral"
|
| 558 |
-
if result.severity == CheckSeverity.ERROR: badge_class = "badge-error"
|
| 559 |
-
elif result.severity == CheckSeverity.WARNING: badge_class = "badge-warning"
|
| 560 |
-
elif result.severity == CheckSeverity.INFO: badge_class = "badge-info"
|
| 561 |
-
|
| 562 |
-
html.append(f'''
|
| 563 |
-
<div class="report-card">
|
| 564 |
-
<div class="card-header">
|
| 565 |
-
<div>
|
| 566 |
-
<h3 class="card-title">{result.checker_name}</h3>
|
| 567 |
-
<div class="card-subtitle">Line {result.line_number}</div>
|
| 568 |
-
</div>
|
| 569 |
-
<span class="badge {badge_class}">{result.severity.name}</span>
|
| 570 |
-
</div>
|
| 571 |
-
<div class="card-content">
|
| 572 |
-
{result.message}
|
| 573 |
-
{f'<div style="margin-top: 8px; background: #f3f4f6; padding: 8px; border-radius: 4px; font-family: monospace;">{result.line_content}</div>' if result.line_content else ''}
|
| 574 |
-
{f'<div style="margin-top: 8px; color: #166534;">💡 Suggestion: {result.suggestion}</div>' if result.suggestion else ''}
|
| 575 |
-
</div>
|
| 576 |
-
</div>
|
| 577 |
-
''')
|
| 578 |
-
|
| 579 |
-
html.append('</div>')
|
| 580 |
-
return "".join(html)
|
| 581 |
-
|
| 582 |
-
def generate_line_html(content: str, results: list) -> str:
|
| 583 |
-
"""Generate HTML for Line-by-Line report."""
|
| 584 |
-
# Build a dictionary of line_number -> list of issues
|
| 585 |
-
issues_by_line = {}
|
| 586 |
-
for r in results:
|
| 587 |
-
if r.line_number not in issues_by_line:
|
| 588 |
-
issues_by_line[r.line_number] = []
|
| 589 |
-
issues_by_line[r.line_number].append(r)
|
| 590 |
-
|
| 591 |
-
lines = content.split('\n')
|
| 592 |
-
|
| 593 |
-
html = ['<div class="scrollable-report-area">']
|
| 594 |
-
|
| 595 |
-
html.append('<div class="report-card"><div class="card-content">Issues are mapped to specific lines below.</div></div>')
|
| 596 |
-
|
| 597 |
-
for i, line in enumerate(lines, 1):
|
| 598 |
-
if i in issues_by_line:
|
| 599 |
-
# Highlight this line
|
| 600 |
-
line_issues = issues_by_line[i]
|
| 601 |
-
|
| 602 |
-
html.append(f'''
|
| 603 |
-
<div class="report-card" style="border-left: 4px solid #ef4444; padding: 12px;">
|
| 604 |
-
<div style="font-family: monospace; color: #6b7280; font-size: 0.9em; margin-bottom: 4px;">Line {i}</div>
|
| 605 |
-
<div style="font-family: monospace; background: #fee2e2; padding: 4px; border-radius: 4px; overflow-x: auto; white-space: pre;">{line}</div>
|
| 606 |
-
<div style="margin-top: 8px;">
|
| 607 |
-
''')
|
| 608 |
-
|
| 609 |
-
for issue in line_issues:
|
| 610 |
-
html.append(f'<div style="color: #991b1b; font-size: 0.95em; margin-top: 4px;">• {issue.message}</div>')
|
| 611 |
-
|
| 612 |
-
html.append('</div></div>')
|
| 613 |
-
|
| 614 |
-
html.append('</div>')
|
| 615 |
-
return "".join(html)
|
| 616 |
|
| 617 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
|
| 619 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
def run_check(
|
| 621 |
-
bib_file,
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
"⚠️ Please upload both `.bib` and `.tex` files."
|
| 647 |
)
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
)
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
# Read tex content for checkers
|
| 662 |
-
tex_content = Path(tex_path).read_text(encoding='utf-8', errors='replace')
|
| 663 |
-
|
| 664 |
-
# Parse files
|
| 665 |
-
bib_parser = BibParser()
|
| 666 |
-
entries = bib_parser.parse_file(bib_path)
|
| 667 |
-
|
| 668 |
-
tex_parser = TexParser()
|
| 669 |
-
tex_parser.parse_file(tex_path)
|
| 670 |
-
|
| 671 |
-
bib_config = config.bibliography
|
| 672 |
-
|
| 673 |
-
# Initialize components
|
| 674 |
-
arxiv_fetcher = None
|
| 675 |
-
crossref_fetcher = None
|
| 676 |
-
semantic_scholar_fetcher = None
|
| 677 |
-
openalex_fetcher = None
|
| 678 |
-
dblp_fetcher = None
|
| 679 |
-
comparator = None
|
| 680 |
-
usage_checker = None
|
| 681 |
-
duplicate_detector = None
|
| 682 |
-
|
| 683 |
-
if bib_config.check_metadata:
|
| 684 |
-
arxiv_fetcher = ArxivFetcher()
|
| 685 |
-
semantic_scholar_fetcher = SemanticScholarFetcher()
|
| 686 |
-
openalex_fetcher = OpenAlexFetcher()
|
| 687 |
-
dblp_fetcher = DBLPFetcher()
|
| 688 |
-
crossref_fetcher = CrossRefFetcher()
|
| 689 |
-
comparator = MetadataComparator()
|
| 690 |
-
|
| 691 |
-
if bib_config.check_usage:
|
| 692 |
-
usage_checker = UsageChecker(tex_parser)
|
| 693 |
-
|
| 694 |
-
if bib_config.check_duplicates:
|
| 695 |
-
duplicate_detector = DuplicateDetector()
|
| 696 |
-
|
| 697 |
-
# Initialize report generator
|
| 698 |
-
report_gen = ReportGenerator(
|
| 699 |
-
minimal_verified=False,
|
| 700 |
-
check_preprint_ratio=bib_config.check_preprint_ratio,
|
| 701 |
-
preprint_warning_threshold=bib_config.preprint_warning_threshold
|
| 702 |
)
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 713 |
results = checker.check(tex_content, {})
|
| 714 |
for r in results:
|
| 715 |
-
r.file_path =
|
| 716 |
submission_results.extend(results)
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
report_gen.set_duplicate_groups(
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
report_gen.set_missing_citations(
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
if usage_checker:
|
| 743 |
usage_result = usage_checker.check_usage(entry)
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
if bib_config.check_metadata and comparator:
|
| 748 |
comparison_result = fetch_and_compare_with_workflow(
|
| 749 |
entry, workflow_config, arxiv_fetcher, crossref_fetcher,
|
| 750 |
-
|
| 751 |
)
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 759 |
)
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 787 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 788 |
|
| 789 |
-
def create_app():
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
# Load icon as base64
|
| 793 |
-
icon_html = ""
|
| 794 |
try:
|
| 795 |
-
icon_path = Path("assets/icon-192.png"
|
| 796 |
if icon_path.exists():
|
| 797 |
with open(icon_path, "rb") as f:
|
| 798 |
-
|
| 799 |
-
icon_html =
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
</
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 828 |
)
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
|
|
|
|
|
|
| 834 |
)
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
)
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 905 |
fn=run_check,
|
| 906 |
inputs=[
|
| 907 |
-
|
| 908 |
check_metadata, check_usage, check_duplicates, check_preprint_ratio,
|
| 909 |
caption, reference, formatting, equation, ai_artifacts,
|
| 910 |
-
sentence, consistency, acronym, number, citation_quality, anonymization
|
|
|
|
| 911 |
],
|
| 912 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 913 |
)
|
| 914 |
-
|
| 915 |
return app
|
| 916 |
|
| 917 |
|
| 918 |
-
# Create the app
|
| 919 |
app = create_app()
|
| 920 |
|
|
|
|
| 921 |
if __name__ == "__main__":
|
|
|
|
| 922 |
app.launch(
|
| 923 |
-
favicon_path=
|
| 924 |
show_error=True,
|
| 925 |
css=CUSTOM_CSS,
|
| 926 |
-
theme=gr.themes.Soft()
|
| 927 |
)
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
BibGuard Gradio web app — minimalist iframe layout.
|
| 4 |
|
| 5 |
+
The right pane embeds the self-contained ``report.html`` produced by
|
| 6 |
+
``src/report/html_report.py`` via ``<iframe srcdoc=...>``. This makes the
|
| 7 |
+
generated report the single source of truth (per-section filters, full-text
|
| 8 |
+
search, dark mode, inline span highlighting all live inside it) and avoids
|
| 9 |
+
re-rendering the same content inside Gradio with stale styles.
|
| 10 |
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import base64
|
| 14 |
+
import logging
|
| 15 |
+
import os
|
| 16 |
import tempfile
|
| 17 |
+
import time
|
| 18 |
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
import gradio as gr
|
| 21 |
|
| 22 |
from src.parsers import BibParser, TexParser
|
| 23 |
+
from src.fetchers import (
|
| 24 |
+
ArxivFetcher, CrossRefFetcher, SemanticScholarFetcher,
|
| 25 |
+
OpenAlexFetcher, DBLPFetcher,
|
| 26 |
+
)
|
| 27 |
from src.analyzers import MetadataComparator, UsageChecker, DuplicateDetector
|
| 28 |
from src.report.generator import ReportGenerator, EntryReport
|
| 29 |
+
from src.config.yaml_config import (
|
| 30 |
+
BibGuardConfig, BibliographyConfig, SubmissionConfig, OutputConfig,
|
| 31 |
+
)
|
| 32 |
+
from src.config.workflow import get_default_workflow
|
| 33 |
from src.checkers import CHECKER_REGISTRY
|
| 34 |
+
from src.checkers.retraction_checker import RetractionChecker
|
| 35 |
+
from src.checkers.url_checker import URLChecker
|
| 36 |
+
from src.utils import http as http_layer
|
| 37 |
+
from src.utils.logging_setup import setup as setup_logging, capture_run
|
| 38 |
+
from src.utils.validation import validate_bib, validate_tex, format_report
|
| 39 |
from app_helper import fetch_and_compare_with_workflow
|
| 40 |
|
| 41 |
+
LOG_PATH = setup_logging(os.environ.get("BIBGUARD_LOG", "WARNING"))
|
| 42 |
+
logger = logging.getLogger("bibguard.app")
|
| 43 |
+
logger.info("BibGuard app starting (log file: %s)", LOG_PATH)
|
| 44 |
+
|
| 45 |
+
# Configure HTTP layer once at import time.
|
| 46 |
+
http_layer.configure(
|
| 47 |
+
contact_email=os.environ.get("BIBGUARD_CONTACT_EMAIL", ""),
|
| 48 |
+
cache_enabled=True,
|
| 49 |
+
cache_ttl_hours=24,
|
| 50 |
+
retry_total=5,
|
| 51 |
+
retry_backoff_factor=1.5,
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# --------------------------------------------------------------------- presets
|
| 56 |
+
|
| 57 |
+
PRESETS = {
|
| 58 |
+
"Quick": {
|
| 59 |
+
"check_metadata": False, "check_duplicates": True, "check_usage": True, "check_preprint_ratio": True,
|
| 60 |
+
"url_liveness": False, "retraction": False,
|
| 61 |
+
"submission": {"caption": True, "reference": True, "formatting": True, "equation": True,
|
| 62 |
+
"ai_artifacts": True, "sentence": True, "consistency": True, "acronym": True,
|
| 63 |
+
"number": True, "citation_quality": True, "anonymization": True},
|
| 64 |
+
},
|
| 65 |
+
"Standard": {
|
| 66 |
+
"check_metadata": False, "check_duplicates": True, "check_usage": True, "check_preprint_ratio": True,
|
| 67 |
+
"url_liveness": False, "retraction": True,
|
| 68 |
+
"submission": {"caption": True, "reference": True, "formatting": True, "equation": True,
|
| 69 |
+
"ai_artifacts": True, "sentence": True, "consistency": True, "acronym": True,
|
| 70 |
+
"number": True, "citation_quality": True, "anonymization": True},
|
| 71 |
+
},
|
| 72 |
+
"Strict": {
|
| 73 |
+
"check_metadata": True, "check_duplicates": True, "check_usage": True, "check_preprint_ratio": True,
|
| 74 |
+
"url_liveness": True, "retraction": True,
|
| 75 |
+
"submission": {"caption": True, "reference": True, "formatting": True, "equation": True,
|
| 76 |
+
"ai_artifacts": True, "sentence": True, "consistency": True, "acronym": True,
|
| 77 |
+
"number": True, "citation_quality": True, "anonymization": True},
|
| 78 |
+
},
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ----------------------------------------------------------------------- CSS
|
| 83 |
|
|
|
|
| 84 |
CUSTOM_CSS = """
|
| 85 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
|
| 86 |
|
| 87 |
+
* { font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; }
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
/* Reserve space for the vertical scrollbar so expanding the Advanced
|
| 90 |
+
accordion (or anything else that adds content) doesn't shift the
|
| 91 |
+
layout horizontally. `overflow-y: scroll` on html is the universal
|
| 92 |
+
fallback for browsers without scrollbar-gutter.
|
| 93 |
+
`overflow-x: hidden` on body kills any page-width jitter coming from
|
| 94 |
+
inner elements that briefly overflow during streaming updates. */
|
| 95 |
+
html { scrollbar-gutter: stable; overflow-y: scroll; overflow-x: hidden; }
|
| 96 |
+
body { overflow-x: hidden; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
.gradio-container {
|
| 99 |
+
max-width: 1400px !important;
|
| 100 |
+
margin: 0 auto !important;
|
| 101 |
+
padding: 0 20px !important;
|
| 102 |
+
box-sizing: border-box !important;
|
| 103 |
width: 100% !important;
|
| 104 |
+
overflow-x: hidden !important;
|
|
|
|
|
|
|
| 105 |
}
|
| 106 |
|
| 107 |
+
/* Header strip */
|
| 108 |
+
.bg-header {
|
| 109 |
+
padding: 14px 4px 12px !important;
|
|
|
|
| 110 |
border-bottom: 1px solid #e5e7eb;
|
| 111 |
+
margin-bottom: 14px;
|
| 112 |
}
|
| 113 |
|
| 114 |
+
/* ==================================================================
|
| 115 |
+
Top toolbar — single horizontal row with all primary controls.
|
| 116 |
+
Every primary control has the SAME explicit 56px height. The little
|
| 117 |
+
filename/info chip beneath sits in a fixed 18px slot. The columns
|
| 118 |
+
wrap that into a 78px tall toolbar that's identical across cells.
|
| 119 |
+
================================================================== */
|
| 120 |
+
.bg-toolbar {
|
| 121 |
+
margin-bottom: 14px;
|
| 122 |
+
gap: 10px !important;
|
| 123 |
+
align-items: flex-start !important;
|
| 124 |
}
|
| 125 |
+
.bg-toolbar .gr-form { gap: 0 !important; }
|
| 126 |
+
.bg-toolbar .gr-block { border: none !important; box-shadow: none !important; padding: 0 !important; }
|
| 127 |
+
|
| 128 |
+
/* Common: any direct primary control fills column width */
|
| 129 |
+
.bg-toolbar > * { width: 100% !important; }
|
| 130 |
+
|
| 131 |
+
/* ---- Upload buttons ---- */
|
| 132 |
+
.bg-upload-btn,
|
| 133 |
+
.bg-upload-btn > .wrap,
|
| 134 |
+
.bg-upload-btn > div {
|
| 135 |
+
height: 56px !important;
|
| 136 |
+
min-height: 56px !important;
|
| 137 |
+
max-height: 56px !important;
|
| 138 |
+
width: 100% !important;
|
| 139 |
}
|
| 140 |
+
.bg-upload-btn button {
|
| 141 |
+
height: 56px !important;
|
| 142 |
+
min-height: 56px !important;
|
| 143 |
+
max-height: 56px !important;
|
| 144 |
+
width: 100% !important;
|
| 145 |
+
padding: 0 14px !important;
|
| 146 |
+
font-size: 13px !important;
|
| 147 |
+
font-weight: 500 !important;
|
| 148 |
+
border-radius: 8px !important;
|
| 149 |
+
border: 1px dashed #cbd5e1 !important;
|
| 150 |
+
background: #f8fafc !important;
|
| 151 |
+
color: #334155 !important;
|
| 152 |
+
transition: border 0.15s, background 0.15s !important;
|
| 153 |
+
line-height: 1 !important;
|
| 154 |
}
|
| 155 |
+
.bg-upload-btn button:hover {
|
| 156 |
+
border-color: #2563eb !important;
|
| 157 |
+
background: #eff6ff !important;
|
| 158 |
+
color: #1e3a8a !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
}
|
| 160 |
|
| 161 |
+
/* ---- Run / Stop button (same column, visibility-swapped) ---- */
|
| 162 |
+
.bg-run-btn,
|
| 163 |
+
.bg-run-btn > .wrap,
|
| 164 |
+
.bg-run-btn > div {
|
| 165 |
+
height: 56px !important;
|
| 166 |
+
min-height: 56px !important;
|
| 167 |
+
max-height: 56px !important;
|
| 168 |
+
width: 100% !important;
|
| 169 |
}
|
| 170 |
+
.bg-run-btn button {
|
| 171 |
+
height: 56px !important;
|
| 172 |
+
min-height: 56px !important;
|
| 173 |
+
max-height: 56px !important;
|
| 174 |
+
width: 100% !important;
|
| 175 |
+
font-weight: 600 !important;
|
| 176 |
+
border-radius: 8px !important;
|
| 177 |
+
font-size: 14px !important;
|
| 178 |
+
line-height: 1 !important;
|
| 179 |
+
padding: 0 16px !important;
|
| 180 |
}
|
| 181 |
+
.bg-stop-btn button {
|
| 182 |
+
background: #dc2626 !important;
|
| 183 |
+
color: white !important;
|
| 184 |
+
border: none !important;
|
|
|
|
|
|
|
| 185 |
}
|
| 186 |
+
.bg-stop-btn button:hover { background: #b91c1c !important; }
|
| 187 |
+
|
| 188 |
+
/* ---- Preset radio as horizontal pill chips ---- */
|
| 189 |
+
.bg-preset,
|
| 190 |
+
.bg-preset > div,
|
| 191 |
+
.bg-preset > .wrap {
|
| 192 |
+
height: 56px !important;
|
| 193 |
+
min-height: 56px !important;
|
| 194 |
+
max-height: 56px !important;
|
| 195 |
+
padding: 0 !important;
|
| 196 |
}
|
| 197 |
+
.bg-preset > label,
|
| 198 |
+
.bg-preset .label-wrap { display: none !important; }
|
| 199 |
+
.bg-preset .wrap,
|
| 200 |
+
.bg-preset > div > div,
|
| 201 |
+
.bg-preset fieldset {
|
| 202 |
+
display: flex !important;
|
| 203 |
+
flex-direction: row !important;
|
| 204 |
+
gap: 4px !important;
|
| 205 |
+
flex-wrap: nowrap !important;
|
| 206 |
+
width: 100% !important;
|
| 207 |
+
height: 56px !important;
|
| 208 |
+
align-items: stretch !important;
|
| 209 |
+
border: none !important;
|
| 210 |
+
padding: 0 !important;
|
| 211 |
+
margin: 0 !important;
|
| 212 |
}
|
| 213 |
+
.bg-preset label {
|
| 214 |
+
flex: 1 1 0 !important;
|
| 215 |
+
margin: 0 !important;
|
| 216 |
+
padding: 0 8px !important;
|
| 217 |
+
height: 56px !important;
|
| 218 |
+
min-height: 56px !important;
|
| 219 |
+
max-height: 56px !important;
|
| 220 |
+
border-radius: 8px !important;
|
| 221 |
+
font-size: 13px !important;
|
| 222 |
+
font-weight: 500 !important;
|
| 223 |
+
border: 1px solid #e5e7eb !important;
|
| 224 |
+
background: #ffffff !important;
|
| 225 |
+
cursor: pointer !important;
|
| 226 |
+
text-align: center !important;
|
| 227 |
+
display: inline-flex !important;
|
| 228 |
+
align-items: center !important;
|
| 229 |
+
justify-content: center !important;
|
| 230 |
+
line-height: 1 !important;
|
| 231 |
+
color: #475569 !important;
|
| 232 |
+
transition: background 0.15s, border 0.15s !important;
|
| 233 |
+
white-space: nowrap !important;
|
| 234 |
}
|
| 235 |
+
.bg-preset label:hover { background: #f8fafc !important; border-color: #cbd5e1 !important; }
|
| 236 |
+
.bg-preset input[type="radio"] { display: none !important; }
|
| 237 |
+
.bg-preset label.selected,
|
| 238 |
+
.bg-preset label:has(input:checked) {
|
| 239 |
+
background: #1e3a8a !important;
|
| 240 |
+
color: #ffffff !important;
|
| 241 |
+
border-color: #1e3a8a !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
}
|
| 243 |
|
| 244 |
+
/* ---- Caption chip beneath each toolbar control ---- */
|
| 245 |
+
.bg-fname {
|
| 246 |
+
font-size: 11.5px;
|
| 247 |
+
color: #94a3b8;
|
| 248 |
+
padding: 4px 8px 0 8px;
|
| 249 |
+
line-height: 1.3;
|
| 250 |
+
overflow: hidden;
|
| 251 |
+
text-overflow: ellipsis;
|
| 252 |
+
white-space: nowrap;
|
| 253 |
+
height: 18px;
|
| 254 |
+
box-sizing: content-box;
|
| 255 |
}
|
| 256 |
+
.bg-fname.ok { color: #166534; font-weight: 500; }
|
| 257 |
+
|
| 258 |
+
/* ==================================================================
|
| 259 |
+
Advanced settings — gr.Row with each Checkbox as its own card.
|
| 260 |
+
Trick: `display: contents` on Gradio's intermediate wrapper makes
|
| 261 |
+
it vanish from the layout tree, so the actual checkbox blocks
|
| 262 |
+
become direct flex children of .bg-row. Card style is applied to
|
| 263 |
+
each block, not the wrapper, so we get N cards per row instead of
|
| 264 |
+
one big box.
|
| 265 |
+
================================================================== */
|
| 266 |
+
.bg-row {
|
| 267 |
+
display: flex !important;
|
| 268 |
+
flex-direction: row !important;
|
| 269 |
+
gap: 10px !important;
|
| 270 |
+
align-items: stretch !important;
|
| 271 |
+
padding: 4px 0 !important;
|
| 272 |
}
|
| 273 |
|
| 274 |
+
/* Flatten Gradio's intermediate `.form` / `.gr-form` wrapper so its
|
| 275 |
+
children become direct flex items of .bg-row. */
|
| 276 |
+
.bg-row > .form,
|
| 277 |
+
.bg-row > .gr-form {
|
| 278 |
+
display: contents !important;
|
| 279 |
+
}
|
| 280 |
+
/* Some Gradio versions emit a plain `<div>` wrapper instead of `.form`.
|
| 281 |
+
We can't safely `display: contents` every direct div (the spacer is
|
| 282 |
+
one), but if the wrapper has only blocks inside, contents flatten it. */
|
| 283 |
+
.bg-row > div:not(.bg-row-spacer):not(.gr-block):not(.block) {
|
| 284 |
+
display: contents !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
}
|
| 286 |
|
| 287 |
+
/* Each individual checkbox block = a card */
|
| 288 |
+
.bg-row .gr-block,
|
| 289 |
+
.bg-row .block {
|
| 290 |
+
flex: 1 1 0 !important;
|
| 291 |
+
min-width: 0 !important;
|
| 292 |
+
background: #f8fafc !important;
|
| 293 |
+
border: 1px solid #e5e7eb !important;
|
| 294 |
+
border-radius: 8px !important;
|
| 295 |
+
padding: 8px 12px !important;
|
| 296 |
+
box-shadow: none !important;
|
| 297 |
+
transition: background 0.15s, border 0.15s !important;
|
| 298 |
}
|
| 299 |
+
.bg-row .gr-block:hover,
|
| 300 |
+
.bg-row .block:hover {
|
| 301 |
+
background: #eff6ff !important;
|
| 302 |
+
border-color: #cbd5e1 !important;
|
| 303 |
}
|
| 304 |
+
.bg-row label,
|
| 305 |
+
.bg-row .gr-checkbox label {
|
| 306 |
+
font-size: 13px !important;
|
| 307 |
+
font-weight: 500 !important;
|
| 308 |
+
line-height: 1.3 !important;
|
| 309 |
+
color: #334155 !important;
|
| 310 |
+
margin: 0 !important;
|
| 311 |
+
padding: 0 !important;
|
|
|
|
|
|
|
|
|
|
| 312 |
}
|
| 313 |
+
.bg-row .gr-info, .bg-row [class*="info"] { display: none !important; }
|
| 314 |
|
| 315 |
+
/* Spacer — invisible flex item that just preserves alignment */
|
| 316 |
+
.bg-row .bg-row-spacer {
|
| 317 |
+
flex: 1 1 0 !important;
|
| 318 |
+
background: transparent !important;
|
| 319 |
+
border: none !important;
|
| 320 |
+
box-shadow: none !important;
|
| 321 |
+
padding: 0 !important;
|
| 322 |
+
visibility: hidden !important;
|
|
|
|
| 323 |
}
|
| 324 |
|
| 325 |
+
/* ==================================================================
|
| 326 |
+
Status strip — thin one-liner above the report.
|
| 327 |
+
The Gradio HTML wrapper itself is pinned to its parent column's width
|
| 328 |
+
so no inner content can change the page geometry during streaming.
|
| 329 |
+
================================================================== */
|
| 330 |
+
#bg-status-wrap,
|
| 331 |
+
#bg-status-wrap > * {
|
| 332 |
+
width: 100% !important;
|
| 333 |
+
max-width: 100% !important;
|
| 334 |
+
min-width: 0 !important;
|
| 335 |
+
box-sizing: border-box !important;
|
| 336 |
+
overflow-x: hidden !important;
|
| 337 |
}
|
| 338 |
+
.bg-status {
|
| 339 |
+
padding: 10px 14px;
|
| 340 |
+
border-radius: 10px;
|
| 341 |
+
background: #f8fafc;
|
| 342 |
+
border: 1px solid #e2e8f0;
|
| 343 |
+
font-size: 12.5px;
|
| 344 |
+
line-height: 1.45;
|
| 345 |
+
color: #334155;
|
| 346 |
+
margin: 8px 0 12px 0;
|
| 347 |
+
max-width: 100%;
|
| 348 |
+
overflow: hidden; /* never let inline content widen the page */
|
| 349 |
+
box-sizing: border-box;
|
| 350 |
+
}
|
| 351 |
+
.bg-status-row {
|
| 352 |
display: flex;
|
| 353 |
+
align-items: center;
|
| 354 |
+
gap: 14px;
|
| 355 |
+
flex-wrap: nowrap; /* one row, ellipsize the middle */
|
| 356 |
+
min-width: 0;
|
| 357 |
+
width: 100%;
|
| 358 |
}
|
| 359 |
+
.bg-status .bg-status-stage {
|
|
|
|
|
|
|
| 360 |
font-weight: 600;
|
| 361 |
+
color: #1e3a8a;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
display: inline-flex;
|
| 363 |
align-items: center;
|
| 364 |
+
gap: 8px;
|
| 365 |
+
flex-shrink: 0;
|
| 366 |
+
white-space: nowrap;
|
|
|
|
| 367 |
}
|
| 368 |
+
.bg-status .bg-status-detail {
|
| 369 |
+
color: #475569;
|
| 370 |
+
flex: 1 1 0;
|
| 371 |
+
min-width: 0;
|
| 372 |
+
overflow: hidden;
|
| 373 |
+
text-overflow: ellipsis;
|
| 374 |
+
white-space: nowrap;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
}
|
| 376 |
+
.bg-status .bg-status-detail code {
|
| 377 |
+
background: #eef2ff;
|
| 378 |
+
padding: 1px 6px;
|
|
|
|
| 379 |
border-radius: 4px;
|
| 380 |
+
font-size: 11.5px;
|
| 381 |
+
color: #1e3a8a;
|
|
|
|
| 382 |
}
|
| 383 |
+
.bg-status .bg-status-meta {
|
| 384 |
+
color: #64748b;
|
| 385 |
+
font-size: 11.5px;
|
| 386 |
+
display: inline-flex;
|
| 387 |
+
flex-wrap: nowrap;
|
| 388 |
gap: 12px;
|
| 389 |
+
flex-shrink: 0;
|
| 390 |
+
white-space: nowrap;
|
| 391 |
}
|
| 392 |
+
.bg-status.done { background: #f0fdf4; border-color: #bbf7d0; }
|
| 393 |
+
.bg-status.done .bg-status-stage { color: #166534; }
|
| 394 |
+
.bg-status.error { background: #fef2f2; border-color: #fecaca; }
|
| 395 |
+
.bg-status.error .bg-status-stage { color: #b91c1c; }
|
| 396 |
+
.bg-status .spin {
|
| 397 |
+
display: inline-block;
|
| 398 |
+
width: 10px; height: 10px;
|
| 399 |
+
border: 2px solid #cbd5e1;
|
| 400 |
+
border-top-color: #2563eb;
|
| 401 |
+
border-radius: 50%;
|
| 402 |
+
animation: bg-spin 0.9s linear infinite;
|
| 403 |
}
|
| 404 |
+
@keyframes bg-spin { to { transform: rotate(360deg); } }
|
| 405 |
|
| 406 |
+
/* ==================================================================
|
| 407 |
+
Report area — full-width iframe.
|
| 408 |
+
================================================================== */
|
| 409 |
+
.bg-main { padding: 0 !important; }
|
| 410 |
+
.bg-report-iframe {
|
| 411 |
+
width: 100%;
|
| 412 |
+
height: 80vh;
|
| 413 |
+
min-height: 620px;
|
| 414 |
+
border: 1px solid #e5e7eb;
|
| 415 |
+
border-radius: 12px;
|
| 416 |
+
background: white;
|
| 417 |
+
box-shadow: 0 1px 2px rgba(0,0,0,0.04);
|
| 418 |
}
|
| 419 |
|
| 420 |
+
/* Empty / error placeholder (full-width, centered card) */
|
| 421 |
+
.bg-empty {
|
| 422 |
+
display: flex; align-items: center; justify-content: center;
|
| 423 |
+
flex-direction: column; gap: 14px;
|
| 424 |
+
min-height: 60vh;
|
| 425 |
+
color: #6b7280; text-align: center;
|
| 426 |
+
border: 2px dashed #e5e7eb; border-radius: 12px;
|
| 427 |
+
padding: 56px 24px;
|
| 428 |
+
background: #fafafa;
|
| 429 |
}
|
| 430 |
+
.bg-empty .bg-empty-icon { font-size: 56px; line-height: 1; }
|
| 431 |
+
.bg-empty .bg-empty-title { font-size: 17px; font-weight: 600; color: #374151; }
|
| 432 |
+
.bg-empty .bg-empty-hint { font-size: 14px; max-width: 580px; line-height: 1.6; }
|
| 433 |
+
.bg-empty .bg-empty-hint code { background: #f3f4f6; padding: 1px 6px; border-radius: 4px; font-size: 13px; }
|
| 434 |
+
|
| 435 |
+
/* Compact downloads section */
|
| 436 |
+
.bg-downloads { gap: 6px !important; }
|
| 437 |
+
.bg-downloads .gr-file { min-height: auto !important; }
|
| 438 |
+
.bg-downloads .bg-file-input > label > div {
|
| 439 |
+
height: 52px !important;
|
| 440 |
+
min-height: 52px !important;
|
| 441 |
+
max-height: 52px !important;
|
| 442 |
}
|
| 443 |
|
| 444 |
+
/* Footer */
|
| 445 |
+
.bg-footer {
|
|
|
|
|
|
|
|
|
|
| 446 |
text-align: center;
|
| 447 |
+
margin-top: 18px;
|
| 448 |
+
padding-top: 12px;
|
| 449 |
+
border-top: 1px solid #f1f5f9;
|
| 450 |
+
font-size: 11.5px;
|
| 451 |
+
color: #9ca3af;
|
| 452 |
}
|
| 453 |
+
.bg-footer code { background: #f3f4f6; padding: 1px 5px; border-radius: 3px; font-size: 11px; }
|
| 454 |
+
.bg-footer a { color: #6b7280; text-decoration: none; }
|
| 455 |
+
.bg-footer a:hover { text-decoration: underline; }
|
| 456 |
+
|
| 457 |
+
/* Trim accordion chrome a bit */
|
| 458 |
+
.gr-accordion { border-radius: 10px !important; border: 1px solid #e5e7eb !important; }
|
| 459 |
+
.gr-accordion > .label-wrap { padding: 8px 12px !important; font-size: 13px !important; }
|
| 460 |
+
|
| 461 |
+
@media (prefers-color-scheme: dark) {
|
| 462 |
+
.bg-empty { background: #161b22; border-color: #2a313c; color: #9ca3af; }
|
| 463 |
+
.bg-empty .bg-empty-title { color: #e6edf3; }
|
| 464 |
+
.bg-empty .bg-empty-hint code { background: #21262d; }
|
| 465 |
+
.bg-report-iframe { background: #0d1117; border-color: #2a313c; box-shadow: none; }
|
| 466 |
+
.bg-status { background: #0f172a; border-color: #1e293b; color: #cbd5e1; }
|
| 467 |
+
.bg-status .bg-status-stage { color: #93c5fd; }
|
| 468 |
+
.bg-status .bg-status-detail { color: #94a3b8; }
|
| 469 |
+
.bg-status .bg-status-detail code { background: #1e293b; color: #93c5fd; }
|
| 470 |
+
.bg-status .bg-status-meta { color: #64748b; }
|
| 471 |
+
.bg-status.done { background: #052e1a; border-color: #14532d; }
|
| 472 |
+
.bg-status.done .bg-status-stage { color: #86efac; }
|
| 473 |
+
.bg-status.error { background: #2a0e0e; border-color: #7f1d1d; }
|
| 474 |
+
.bg-preset label { background: #161b22 !important; border-color: #2a313c !important; color: #cbd5e1 !important; }
|
| 475 |
+
.bg-preset label:hover { background: #1e293b !important; }
|
| 476 |
+
.bg-preset .selected { background: #2563eb !important; border-color: #2563eb !important; }
|
| 477 |
+
.bg-footer { border-color: #1e293b; }
|
| 478 |
}
|
| 479 |
+
"""
|
| 480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
|
| 482 |
+
EMPTY_PANEL_HTML = """
|
| 483 |
+
<div class="bg-empty">
|
| 484 |
+
<div class="bg-empty-icon">📄</div>
|
| 485 |
+
<div class="bg-empty-title">Your interactive report appears here</div>
|
| 486 |
+
<div class="bg-empty-hint">
|
| 487 |
+
Upload a <code>.bib</code> file and a <code>.tex</code> file in the toolbar above,
|
| 488 |
+
pick a preset, then press <strong>Run check</strong>. The report renders as a
|
| 489 |
+
self-contained HTML page with per-section filters, full-text search,
|
| 490 |
+
inline span highlighting, and dark-mode support.
|
| 491 |
+
</div>
|
| 492 |
+
</div>
|
| 493 |
"""
|
| 494 |
|
| 495 |
+
# Initial ("Idle") content of the status strip above the report, shown before
# the first run. Matches the markup structure produced by `_status_html`.
EMPTY_STATUS_HTML = (
    '<div class="bg-status">'
    '<div class="bg-status-row">'
    '<span class="bg-status-stage">○ Idle</span>'
    '<span class="bg-status-detail">Upload <code>.bib</code> + <code>.tex</code> '
    'and press <strong>Run check</strong> to begin.</span>'
    '</div></div>'
)
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
def _placeholder(message: str, color: str = "#b91c1c") -> str:
|
| 506 |
+
"""Inline error/info card shown in place of the iframe."""
|
| 507 |
+
return (
|
| 508 |
+
f'<div class="bg-empty" style="color:{color};border-color:{color}33">'
|
| 509 |
+
f'<div class="bg-empty-icon">⚠️</div>'
|
| 510 |
+
f'<div class="bg-empty-title">{message}</div>'
|
| 511 |
+
f'</div>'
|
| 512 |
+
)
|
| 513 |
+
|
| 514 |
+
|
| 515 |
+
def _html_to_iframe(html: str) -> str:
|
| 516 |
+
"""
|
| 517 |
+
Embed an HTML document inside ``<iframe srcdoc>``.
|
| 518 |
+
|
| 519 |
+
We escape only ``&`` and ``"`` — these are the two characters that can
|
| 520 |
+
break the attribute value or get re-decoded as entities. ``<`` and ``>``
|
| 521 |
+
must stay raw, otherwise the inner document would be HTML-encoded.
|
| 522 |
+
"""
|
| 523 |
+
escaped = html.replace("&", "&").replace('"', """)
|
| 524 |
+
return (
|
| 525 |
+
f'<iframe class="bg-report-iframe" srcdoc="{escaped}" '
|
| 526 |
+
f'sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" '
|
| 527 |
+
f'loading="lazy"></iframe>'
|
| 528 |
+
)
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
def _status_html(stage: str, detail: str = "", meta: list[str] | None = None,
|
| 532 |
+
state: str = "running") -> str:
|
| 533 |
+
"""Render the live-status strip shown above the report.
|
| 534 |
+
|
| 535 |
+
Layout is a single horizontal row: [stage] [detail] [meta chips].
|
| 536 |
+
Wraps cleanly on narrow screens.
|
| 537 |
+
"""
|
| 538 |
+
if state == "running":
|
| 539 |
+
stage_icon = '<span class="spin"></span>'
|
| 540 |
+
elif state == "done":
|
| 541 |
+
stage_icon = '<span>✓</span>'
|
| 542 |
+
elif state == "error":
|
| 543 |
+
stage_icon = '<span>⚠</span>'
|
| 544 |
+
else:
|
| 545 |
+
stage_icon = '<span>○</span>'
|
| 546 |
+
detail_html = f'<span class="bg-status-detail">{detail}</span>' if detail else '<span class="bg-status-detail"></span>'
|
| 547 |
+
meta_html = ""
|
| 548 |
+
if meta:
|
| 549 |
+
meta_html = (
|
| 550 |
+
'<span class="bg-status-meta">'
|
| 551 |
+
+ " ".join(f"<span>{m}</span>" for m in meta)
|
| 552 |
+
+ "</span>"
|
| 553 |
+
)
|
| 554 |
+
return (
|
| 555 |
+
f'<div class="bg-status {state}">'
|
| 556 |
+
f'<div class="bg-status-row">'
|
| 557 |
+
f'<span class="bg-status-stage">{stage_icon}<span>{stage}</span></span>'
|
| 558 |
+
f'{detail_html}{meta_html}'
|
| 559 |
+
f'</div></div>'
|
| 560 |
+
)
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
# --------------------------------------------------------------- config glue

def create_config_from_ui(
    check_metadata, check_usage, check_duplicates, check_preprint_ratio,
    caption, reference, formatting, equation, ai_artifacts,
    sentence, consistency, acronym, number, citation_quality, anonymization,
) -> BibGuardConfig:
    """Assemble a BibGuardConfig from the flat tuple of UI checkbox values.

    The argument order mirrors the Gradio input list wired to the run button,
    so this function is the single translation point between UI state and the
    typed config objects.
    """
    cfg = BibGuardConfig()
    cfg.bibliography = BibliographyConfig(
        check_metadata=check_metadata,
        check_usage=check_usage,
        check_duplicates=check_duplicates,
        check_preprint_ratio=check_preprint_ratio,
        check_relevance=False,  # LLM disabled in web mode
    )
    cfg.submission = SubmissionConfig(
        caption=caption,
        reference=reference,
        formatting=formatting,
        equation=equation,
        ai_artifacts=ai_artifacts,
        sentence=sentence,
        consistency=consistency,
        acronym=acronym,
        number=number,
        citation_quality=citation_quality,
        anonymization=anonymization,
    )
    # Web mode: suppress console chatter, keep verified entries in the report.
    cfg.output = OutputConfig(quiet=True, minimal_verified=False)
    return cfg
|
| 586 |
|
| 587 |
|
| 588 |
+
def apply_preset(name: str):
    """Translate a preset name into the flat tuple of UI control values.

    Unknown names fall back to the "Standard" preset. The tuple order matches
    the outputs wired to the preset radio in the UI: four bibliography flags,
    eleven submission flags, then the two network flags.
    """
    preset = PRESETS.get(name, PRESETS["Standard"])
    sub = preset["submission"]
    bib_flags = (
        preset["check_metadata"], preset["check_usage"],
        preset["check_duplicates"], preset["check_preprint_ratio"],
    )
    sub_flags = tuple(sub[key] for key in (
        "caption", "reference", "formatting", "equation",
        "ai_artifacts", "sentence", "consistency", "acronym",
        "number", "citation_quality", "anonymization",
    ))
    net_flags = (preset["url_liveness"], preset["retraction"])
    return bib_flags + sub_flags + net_flags
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
|
| 599 |
|
| 600 |
+
# Short, human-readable descriptions of what each preset costs/covers,
# rendered under the preset radio by `_preset_caption_html`.
_PRESET_CAPTIONS = {
    "Quick": "local checks only · no network · instant",
    "Standard": "local checks + retraction lookup (CrossRef)",
    "Strict": "+ URL liveness + multi-source metadata (slow)",
}
|
| 605 |
+
|
| 606 |
+
|
| 607 |
+
def _preset_caption_html(name: str) -> str:
    """Return the small centered caption chip for a preset.

    Unknown preset names yield an empty (but still present) chip so the
    fixed-height toolbar slot never collapses.
    """
    caption = _PRESET_CAPTIONS.get(name, "")
    return f'<div class="bg-fname" style="text-align:center">{caption}</div>'
|
| 610 |
|
| 611 |
|
| 612 |
+
# ------------------------------------------------------------------ run_check
# Streaming generator. Each yield is a 7-tuple:
#   (iframe_html, status_html, html_path, md_path, json_path,
#    cleaned_bib_path, log_path)
# `capture_run` attaches a per-run DEBUG file handler so any exception or
# warning anywhere in the pipeline is recorded with full traceback at
# `<out_dir>/bibguard.log`, which is then downloadable. The status panel
# surfaces warning+error counts so problems aren't invisible.

def run_check(
    bib_file, tex_file,
    check_metadata, check_usage, check_duplicates, check_preprint_ratio,
    caption, reference, formatting, equation, ai_artifacts,
    sentence, consistency, acronym, number, citation_quality, anonymization,
    url_liveness=False, retraction=True,
):
    """Run the full check pipeline as a streaming generator with per-run logging.

    `bib_file` / `tex_file` are filesystem path strings (carried by gr.State),
    not gr.File objects. The status panel is the single source of progress
    feedback — no separate gr.Progress bar.

    Yields 7-tuples (see the module comment above): the iframe HTML, the
    status-strip HTML, and five artifact paths (``None`` until available).
    """
    started = time.time()

    def _elapsed() -> str:
        # Whole-second elapsed-time chip shown in the status strip.
        return f"⏱ {int(time.time() - started)}s"

    # Initial state: keep current report (None means clear).
    # Bail out early with an error status if either input is missing.
    if not bib_file or not tex_file:
        yield (
            _placeholder("Please choose both a .bib and a .tex file in the toolbar."),
            _status_html("Waiting for files",
                         "Pick a .bib and a .tex file from the toolbar to start.",
                         state="error"),
            None, None, None, None, None,
        )
        return

    # Allocate the artifact dir up-front so the per-run log lives next to
    # the report files.
    out_dir = Path(tempfile.mkdtemp(prefix="bibguard_"))
    log_path_target = out_dir / "bibguard.log"

    # Reset per-source circuit breakers so a previous run's flaky source
    # doesn't carry over and skip valid lookups in this run.
    http_layer.reset_breakers()

    # capture_run installs the per-run file handler and hands back the live
    # log path plus a stats object tracking logged warning/error counts.
    with capture_run(target_path=log_path_target) as (log_path, log_stats):
        logger.info("=== run_check start: bib=%s tex=%s ===", bib_file, tex_file)
        try:
            # Delegate the actual pipeline; re-yield its updates unchanged.
            yield from _run_check_impl(
                bib_file, tex_file, out_dir, log_path, log_stats,
                check_metadata, check_usage, check_duplicates, check_preprint_ratio,
                caption, reference, formatting, equation, ai_artifacts,
                sentence, consistency, acronym, number, citation_quality, anonymization,
                url_liveness, retraction, started, _elapsed,
            )
        except Exception as e:
            # Entry-level guard: anything the inner pipeline didn't catch
            # still produces a user-visible error card plus a downloadable log.
            logger.exception("run_check crashed (entry-level guard)")
            yield (
                _placeholder(f"Unhandled error: {e}"),
                _status_html("Failed", f"{e} — see <code>bibguard.log</code> for the full traceback.",
                             state="error"),
                None, None, None, None, str(log_path),
            )
        finally:
            logger.info("=== run_check end: warnings=%d errors=%d ===",
                        log_stats.warnings, log_stats.errors)
|
| 680 |
+
|
| 681 |
+
|
| 682 |
+
def _run_check_impl(
|
| 683 |
+
bib_file, tex_file, out_dir, log_path, log_stats,
|
| 684 |
+
check_metadata, check_usage, check_duplicates, check_preprint_ratio,
|
| 685 |
+
caption, reference, formatting, equation, ai_artifacts,
|
| 686 |
+
sentence, consistency, acronym, number, citation_quality, anonymization,
|
| 687 |
+
url_liveness, retraction, started, _elapsed,
|
| 688 |
+
):
|
| 689 |
+
"""Inner pipeline. Wrapped in `capture_run` by `run_check`.
|
| 690 |
+
|
| 691 |
+
Every yield is a 7-tuple ending with the log path so the user can
|
| 692 |
+
download `bibguard.log` even from intermediate updates.
|
| 693 |
+
"""
|
| 694 |
+
log_path_str = str(log_path)
|
| 695 |
+
|
| 696 |
+
bib_path = Path(bib_file)
|
| 697 |
+
tex_path = Path(tex_file)
|
| 698 |
+
logger.info("Inputs: bib=%s tex=%s out_dir=%s", bib_path, tex_path, out_dir)
|
| 699 |
+
|
| 700 |
+
def _meta_with_logs(extra: list[str]) -> list[str]:
|
| 701 |
+
out = list(extra)
|
| 702 |
+
if log_stats.warnings or log_stats.errors:
|
| 703 |
+
out.append(f"⚠ {log_stats.warnings}w / {log_stats.errors}e logged")
|
| 704 |
+
return out
|
| 705 |
+
|
| 706 |
+
yield (
|
| 707 |
+
gr.update(),
|
| 708 |
+
_status_html("Validating files",
|
| 709 |
+
f"Reading <code>{bib_path.name}</code> and <code>{tex_path.name}</code>",
|
| 710 |
+
meta=_meta_with_logs([_elapsed()])),
|
| 711 |
+
None, None, None, None, log_path_str,
|
| 712 |
+
)
|
| 713 |
+
|
| 714 |
+
# Pre-flight content validation
|
| 715 |
+
bib_rep = validate_bib(bib_path)
|
| 716 |
+
tex_rep = validate_tex(tex_path)
|
| 717 |
+
msg = "\n".join(filter(None, [
|
| 718 |
+
format_report(bib_rep, bib_path.name),
|
| 719 |
+
format_report(tex_rep, tex_path.name),
|
| 720 |
+
]))
|
| 721 |
+
if not bib_rep.ok or not tex_rep.ok:
|
| 722 |
+
logger.error("File validation failed:\n%s", msg)
|
| 723 |
+
block = (
|
| 724 |
+
f'<div class="bg-empty" style="color:#b91c1c;border-color:#b91c1c33">'
|
| 725 |
+
f'<div class="bg-empty-icon">⚠️</div>'
|
| 726 |
+
f'<div class="bg-empty-title">File validation failed</div>'
|
| 727 |
+
f'<pre style="white-space:pre-wrap;font-size:13px;color:#7f1d1d;'
|
| 728 |
+
f'background:#fef2f2;padding:12px;border-radius:6px;max-width:540px">{msg}</pre>'
|
| 729 |
+
f'</div>'
|
| 730 |
)
|
| 731 |
+
yield (
|
| 732 |
+
block,
|
| 733 |
+
_status_html("File validation failed", msg.replace("\n", "<br>"),
|
| 734 |
+
state="error"),
|
| 735 |
+
None, None, None, None, log_path_str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 736 |
)
|
| 737 |
+
return
|
| 738 |
+
elif msg:
|
| 739 |
+
logger.info("Validation warnings:\n%s", msg)
|
| 740 |
+
|
| 741 |
+
config = create_config_from_ui(
|
| 742 |
+
check_metadata, check_usage, check_duplicates, check_preprint_ratio,
|
| 743 |
+
caption, reference, formatting, equation, ai_artifacts,
|
| 744 |
+
sentence, consistency, acronym, number, citation_quality, anonymization,
|
| 745 |
+
)
|
| 746 |
+
|
| 747 |
+
yield (
|
| 748 |
+
gr.update(),
|
| 749 |
+
_status_html("Parsing", "Loading bibliography and LaTeX source",
|
| 750 |
+
meta=_meta_with_logs([_elapsed()])),
|
| 751 |
+
None, None, None, None, log_path_str,
|
| 752 |
+
)
|
| 753 |
+
|
| 754 |
+
tex_content = tex_path.read_text(encoding='utf-8', errors='replace')
|
| 755 |
+
bib_parser = BibParser()
|
| 756 |
+
entries = bib_parser.parse_file(str(bib_path))
|
| 757 |
+
tex_parser = TexParser()
|
| 758 |
+
tex_parser.parse_file(str(tex_path))
|
| 759 |
+
logger.info("Parsed %d bib entries from %s", len(entries), bib_path.name)
|
| 760 |
+
|
| 761 |
+
bib_config = config.bibliography
|
| 762 |
+
|
| 763 |
+
# Init components
|
| 764 |
+
arxiv_fetcher = crossref_fetcher = ss_fetcher = oa_fetcher = dblp_fetcher = None
|
| 765 |
+
comparator = usage_checker = duplicate_detector = None
|
| 766 |
+
|
| 767 |
+
if bib_config.check_metadata:
|
| 768 |
+
arxiv_fetcher = ArxivFetcher()
|
| 769 |
+
ss_fetcher = SemanticScholarFetcher()
|
| 770 |
+
oa_fetcher = OpenAlexFetcher()
|
| 771 |
+
dblp_fetcher = DBLPFetcher()
|
| 772 |
+
crossref_fetcher = CrossRefFetcher()
|
| 773 |
+
comparator = MetadataComparator()
|
| 774 |
+
if bib_config.check_usage:
|
| 775 |
+
usage_checker = UsageChecker(tex_parser)
|
| 776 |
+
if bib_config.check_duplicates:
|
| 777 |
+
duplicate_detector = DuplicateDetector()
|
| 778 |
+
|
| 779 |
+
report_gen = ReportGenerator(
|
| 780 |
+
minimal_verified=False,
|
| 781 |
+
check_preprint_ratio=bib_config.check_preprint_ratio,
|
| 782 |
+
preprint_warning_threshold=bib_config.preprint_warning_threshold,
|
| 783 |
+
)
|
| 784 |
+
report_gen.set_metadata([str(bib_path)], [str(tex_path)])
|
| 785 |
+
|
| 786 |
+
# Submission quality checks
|
| 787 |
+
yield (
|
| 788 |
+
gr.update(),
|
| 789 |
+
_status_html("LaTeX quality checks",
|
| 790 |
+
f"Running {len(config.submission.get_enabled_checkers())} checkers on the LaTeX source",
|
| 791 |
+
meta=_meta_with_logs([f"📚 {len(entries)} bib entries", _elapsed()])),
|
| 792 |
+
None, None, None, None, log_path_str,
|
| 793 |
+
)
|
| 794 |
+
submission_results = []
|
| 795 |
+
for name in config.submission.get_enabled_checkers():
|
| 796 |
+
if name in CHECKER_REGISTRY:
|
| 797 |
+
try:
|
| 798 |
+
checker = CHECKER_REGISTRY[name]()
|
| 799 |
results = checker.check(tex_content, {})
|
| 800 |
for r in results:
|
| 801 |
+
r.file_path = str(tex_path)
|
| 802 |
submission_results.extend(results)
|
| 803 |
+
except Exception:
|
| 804 |
+
logger.exception("Checker %s crashed", name)
|
| 805 |
+
report_gen.set_submission_results(submission_results, None)
|
| 806 |
+
|
| 807 |
+
if bib_config.check_duplicates and duplicate_detector:
|
| 808 |
+
try:
|
| 809 |
+
report_gen.set_duplicate_groups(duplicate_detector.find_duplicates(entries))
|
| 810 |
+
except Exception:
|
| 811 |
+
logger.exception("Duplicate detection crashed")
|
| 812 |
+
if bib_config.check_usage and usage_checker:
|
| 813 |
+
try:
|
| 814 |
+
report_gen.set_missing_citations(usage_checker.get_missing_entries(entries))
|
| 815 |
+
except Exception:
|
| 816 |
+
logger.exception("Missing-citation lookup crashed")
|
| 817 |
+
|
| 818 |
+
# Per-entry workflow
|
| 819 |
+
total = max(1, len(entries))
|
| 820 |
+
workflow_config = get_default_workflow()
|
| 821 |
+
verified_count = 0
|
| 822 |
+
flagged_count = 0
|
| 823 |
+
not_found_count = 0
|
| 824 |
+
last_yield = time.time()
|
| 825 |
+
|
| 826 |
+
def _identifier_chip(entry) -> str:
|
| 827 |
+
"""Tiny inline hint about which IDs we have for this entry."""
|
| 828 |
+
bits = []
|
| 829 |
+
if entry.doi: bits.append("DOI")
|
| 830 |
+
if entry.has_arxiv: bits.append("arXiv")
|
| 831 |
+
if entry.title and not bits: bits.append("title")
|
| 832 |
+
elif entry.title: bits.append("title")
|
| 833 |
+
return " + ".join(bits) if bits else "no identifiers"
|
| 834 |
+
|
| 835 |
+
def _outcome_label(cmp) -> str:
|
| 836 |
+
if cmp is None:
|
| 837 |
+
return ""
|
| 838 |
+
if cmp.source == "unable":
|
| 839 |
+
return "<span style='color:#b45309'>? no metadata</span>"
|
| 840 |
+
if cmp.is_match:
|
| 841 |
+
return f"<span style='color:#166534'>✓ verified by {cmp.source}</span>"
|
| 842 |
+
return f"<span style='color:#b45309'>⚠ flagged ({cmp.source})</span>"
|
| 843 |
+
|
| 844 |
+
for i, entry in enumerate(entries):
|
| 845 |
+
# ── Pre-fetch status: announce identifier set BEFORE the network roundtrip
|
| 846 |
+
# so the user sees what's being attempted, not just the entry name.
|
| 847 |
+
if bib_config.check_metadata and comparator:
|
| 848 |
+
now = time.time()
|
| 849 |
+
if now - last_yield > 0.4 or i == 0:
|
| 850 |
+
ids = _identifier_chip(entry)
|
| 851 |
+
detail = f"<code>{entry.key}</code> · querying via <strong>{ids}</strong>"
|
| 852 |
+
if entry.title:
|
| 853 |
+
short = entry.title[:70] + ("…" if len(entry.title) > 70 else "")
|
| 854 |
+
detail += f" — <span style='color:#64748b'>{short}</span>"
|
| 855 |
+
yield (
|
| 856 |
+
gr.update(),
|
| 857 |
+
_status_html(
|
| 858 |
+
f"Verifying entry {i + 1}/{total}",
|
| 859 |
+
detail,
|
| 860 |
+
meta=_meta_with_logs([
|
| 861 |
+
f"📚 {total} total",
|
| 862 |
+
f"✓ {verified_count}",
|
| 863 |
+
f"⚠ {flagged_count}",
|
| 864 |
+
f"? {not_found_count}",
|
| 865 |
+
_elapsed(),
|
| 866 |
+
]),
|
| 867 |
+
),
|
| 868 |
+
None, None, None, None, log_path_str,
|
| 869 |
+
)
|
| 870 |
+
last_yield = now
|
| 871 |
+
|
| 872 |
+
usage_result = None
|
| 873 |
+
comparison_result = None
|
| 874 |
+
try:
|
| 875 |
if usage_checker:
|
| 876 |
usage_result = usage_checker.check_usage(entry)
|
| 877 |
+
except Exception:
|
| 878 |
+
logger.exception("Usage check crashed for entry=%s", entry.key)
|
| 879 |
+
try:
|
| 880 |
if bib_config.check_metadata and comparator:
|
| 881 |
comparison_result = fetch_and_compare_with_workflow(
|
| 882 |
entry, workflow_config, arxiv_fetcher, crossref_fetcher,
|
| 883 |
+
ss_fetcher, oa_fetcher, dblp_fetcher, comparator,
|
| 884 |
)
|
| 885 |
+
if comparison_result is None or comparison_result.source == "unable":
|
| 886 |
+
not_found_count += 1
|
| 887 |
+
elif comparison_result.is_match:
|
| 888 |
+
verified_count += 1
|
| 889 |
+
else:
|
| 890 |
+
flagged_count += 1
|
| 891 |
+
except Exception:
|
| 892 |
+
logger.exception("Metadata fetch crashed for entry=%s", entry.key)
|
| 893 |
+
report_gen.add_entry_report(EntryReport(
|
| 894 |
+
entry=entry, comparison=comparison_result,
|
| 895 |
+
usage=usage_result, evaluations=[],
|
| 896 |
+
))
|
| 897 |
+
|
| 898 |
+
# ── Post-fetch status: show outcome inline so the user can watch
|
| 899 |
+
# results stream in (verified / flagged / not found).
|
| 900 |
+
now = time.time()
|
| 901 |
+
if now - last_yield > 0.4 or i == total - 1:
|
| 902 |
+
outcome = _outcome_label(comparison_result)
|
| 903 |
+
detail_parts = [f"<code>{entry.key}</code>"]
|
| 904 |
+
if outcome:
|
| 905 |
+
detail_parts.append(outcome)
|
| 906 |
+
if entry.title:
|
| 907 |
+
short = entry.title[:70] + ("…" if len(entry.title) > 70 else "")
|
| 908 |
+
detail_parts.append(f"<span style='color:#64748b'>{short}</span>")
|
| 909 |
+
detail = " · ".join(detail_parts)
|
| 910 |
+
meta = _meta_with_logs([
|
| 911 |
+
f"📚 {i + 1}/{total}",
|
| 912 |
+
f"✓ {verified_count}",
|
| 913 |
+
f"⚠ {flagged_count}",
|
| 914 |
+
f"? {not_found_count}",
|
| 915 |
+
_elapsed(),
|
| 916 |
+
])
|
| 917 |
+
yield (
|
| 918 |
+
gr.update(),
|
| 919 |
+
_status_html(f"Bibliography {i + 1}/{total}", detail, meta=meta),
|
| 920 |
+
None, None, None, None, log_path_str,
|
| 921 |
)
|
| 922 |
+
last_yield = now
|
| 923 |
+
|
| 924 |
+
if retraction:
|
| 925 |
+
try:
|
| 926 |
+
doi_count = sum(1 for e in entries if getattr(e, "doi", ""))
|
| 927 |
+
yield (
|
| 928 |
+
gr.update(),
|
| 929 |
+
_status_html("Retraction lookups",
|
| 930 |
+
f"Querying CrossRef for {doi_count} DOI(s)",
|
| 931 |
+
meta=_meta_with_logs([_elapsed()])),
|
| 932 |
+
None, None, None, None, log_path_str,
|
| 933 |
+
)
|
| 934 |
+
report_gen.set_retraction_findings(RetractionChecker().check_entries(entries))
|
| 935 |
+
except Exception:
|
| 936 |
+
logger.exception("Retraction lookup crashed")
|
| 937 |
+
|
| 938 |
+
if url_liveness:
|
| 939 |
+
try:
|
| 940 |
+
url_count = sum(1 for e in entries if getattr(e, "url", ""))
|
| 941 |
+
yield (
|
| 942 |
+
gr.update(),
|
| 943 |
+
_status_html("URL liveness",
|
| 944 |
+
f"HEAD-checking {url_count} URL(s) in parallel",
|
| 945 |
+
meta=_meta_with_logs([_elapsed()])),
|
| 946 |
+
None, None, None, None, log_path_str,
|
| 947 |
+
)
|
| 948 |
+
report_gen.set_url_findings(URLChecker().check_entries(entries))
|
| 949 |
+
except Exception:
|
| 950 |
+
logger.exception("URL liveness crashed")
|
| 951 |
+
|
| 952 |
+
# Save artifacts
|
| 953 |
+
yield (
|
| 954 |
+
gr.update(),
|
| 955 |
+
_status_html("Building report",
|
| 956 |
+
"Rendering self-contained HTML, JSON, and Markdown",
|
| 957 |
+
meta=_meta_with_logs([_elapsed()])),
|
| 958 |
+
None, None, None, None, log_path_str,
|
| 959 |
+
)
|
| 960 |
+
html_path = out_dir / "report.html"
|
| 961 |
+
md_path = out_dir / "bibliography_report.md"
|
| 962 |
+
json_path = out_dir / "report.json"
|
| 963 |
+
cleaned_bib_path: Path | None = None
|
| 964 |
|
| 965 |
+
try:
|
| 966 |
+
report_gen.save_html(str(html_path))
|
| 967 |
+
report_gen.save_bibliography_report(str(md_path))
|
| 968 |
+
report_gen.save_json(str(json_path))
|
| 969 |
+
if usage_checker:
|
| 970 |
+
used_keys = {er.entry.key for er in report_gen.entries if er.usage and er.usage.is_used}
|
| 971 |
+
if used_keys:
|
| 972 |
+
cleaned_bib_path = out_dir / f"{bib_path.stem}_only_used.bib"
|
| 973 |
+
bib_parser.filter_file(str(bib_path), str(cleaned_bib_path), used_keys)
|
| 974 |
+
except Exception:
|
| 975 |
+
logger.exception("Artifact generation failed")
|
| 976 |
|
| 977 |
+
# Embed report.html as iframe srcdoc
|
| 978 |
+
if html_path.exists():
|
| 979 |
+
iframe_html = _html_to_iframe(html_path.read_text(encoding='utf-8'))
|
| 980 |
+
else:
|
| 981 |
+
iframe_html = _placeholder("Report generation failed — see bibguard.log.")
|
| 982 |
+
|
| 983 |
+
meta = _meta_with_logs([
|
| 984 |
+
f"📚 {len(entries)} entries",
|
| 985 |
+
f"✓ {verified_count} verified",
|
| 986 |
+
f"⚠ {flagged_count} flagged",
|
| 987 |
+
_elapsed(),
|
| 988 |
+
])
|
| 989 |
+
state = "done"
|
| 990 |
+
summary = "Report ready. Use the right pane to filter, search, and copy fixes."
|
| 991 |
+
if log_stats.errors > 0:
|
| 992 |
+
state = "error"
|
| 993 |
+
summary = (f"Done with {log_stats.errors} error(s) and {log_stats.warnings} warning(s) "
|
| 994 |
+
"logged — see <code>bibguard.log</code> for full tracebacks.")
|
| 995 |
+
elif log_stats.warnings > 0:
|
| 996 |
+
summary = (f"Report ready ({log_stats.warnings} warnings logged — see "
|
| 997 |
+
"<code>bibguard.log</code>).")
|
| 998 |
+
|
| 999 |
+
yield (
|
| 1000 |
+
iframe_html,
|
| 1001 |
+
_status_html("Done", summary, meta=meta, state=state),
|
| 1002 |
+
str(html_path) if html_path.exists() else None,
|
| 1003 |
+
str(md_path) if md_path.exists() else None,
|
| 1004 |
+
str(json_path) if json_path.exists() else None,
|
| 1005 |
+
str(cleaned_bib_path) if (cleaned_bib_path and cleaned_bib_path.exists()) else None,
|
| 1006 |
+
log_path_str,
|
| 1007 |
+
)
|
| 1008 |
+
|
| 1009 |
+
|
| 1010 |
+
# --------------------------------------------------------------------- layout
|
| 1011 |
|
| 1012 |
+
def create_app() -> gr.Blocks:
|
| 1013 |
+
# Inline app icon as a base64 data URL — works regardless of cwd.
|
| 1014 |
+
icon_html = '<span style="font-size:28px">🛡️</span>'
|
|
|
|
|
|
|
| 1015 |
try:
|
| 1016 |
+
icon_path = Path(__file__).parent / "assets" / "icon-192.png"
|
| 1017 |
if icon_path.exists():
|
| 1018 |
with open(icon_path, "rb") as f:
|
| 1019 |
+
b64 = base64.b64encode(f.read()).decode()
|
| 1020 |
+
icon_html = (
|
| 1021 |
+
f'<img src="data:image/png;base64,{b64}" '
|
| 1022 |
+
f'style="width:32px;height:32px;border-radius:6px" alt="BibGuard">'
|
| 1023 |
+
)
|
| 1024 |
+
except Exception as e:
|
| 1025 |
+
logger.debug("Icon load failed; using emoji fallback: %s", e, exc_info=True)
|
| 1026 |
+
|
| 1027 |
+
with gr.Blocks(
|
| 1028 |
+
title="BibGuard — Bibliography & LaTeX Quality Auditor",
|
| 1029 |
+
) as app:
|
| 1030 |
+
|
| 1031 |
+
gr.HTML(f"""
|
| 1032 |
+
<div class="bg-header" style="display:flex;align-items:center;gap:10px">
|
| 1033 |
+
{icon_html}
|
| 1034 |
+
<strong style="font-size:18px">BibGuard</strong>
|
| 1035 |
+
<span style="color:#6b7280;font-size:13px">— Bibliography & LaTeX quality auditor</span>
|
| 1036 |
+
<span style="flex:1"></span>
|
| 1037 |
+
<a href="https://github.com/thinkwee/BibGuard" target="_blank"
|
| 1038 |
+
style="color:#6b7280;text-decoration:none;font-size:13px">GitHub ↗</a>
|
| 1039 |
+
</div>
|
| 1040 |
+
""")
|
| 1041 |
+
|
| 1042 |
+
# ───────────────────────── Top toolbar ─────────────────────────
|
| 1043 |
+
# All primary controls on a single horizontal row, every primary
|
| 1044 |
+
# widget pinned to 56px height. gr.UploadButton replaces gr.File
|
| 1045 |
+
# because the latter's drop-zone doesn't shrink to a toolbar.
|
| 1046 |
+
with gr.Row(elem_classes=["bg-toolbar"]):
|
| 1047 |
+
with gr.Column(scale=2, min_width=200):
|
| 1048 |
+
bib_btn = gr.UploadButton(
|
| 1049 |
+
"📚 Choose .bib file",
|
| 1050 |
+
file_types=[".bib"], file_count="single",
|
| 1051 |
+
elem_classes=["bg-upload-btn"],
|
| 1052 |
+
)
|
| 1053 |
+
bib_status = gr.HTML('<div class="bg-fname">no file selected</div>')
|
| 1054 |
+
with gr.Column(scale=2, min_width=200):
|
| 1055 |
+
tex_btn = gr.UploadButton(
|
| 1056 |
+
"📄 Choose .tex file",
|
| 1057 |
+
file_types=[".tex"], file_count="single",
|
| 1058 |
+
elem_classes=["bg-upload-btn"],
|
| 1059 |
)
|
| 1060 |
+
tex_status = gr.HTML('<div class="bg-fname">no file selected</div>')
|
| 1061 |
+
with gr.Column(scale=3, min_width=280):
|
| 1062 |
+
preset = gr.Radio(
|
| 1063 |
+
choices=list(PRESETS.keys()),
|
| 1064 |
+
value="Standard",
|
| 1065 |
+
show_label=False,
|
| 1066 |
+
elem_classes=["bg-preset"],
|
| 1067 |
)
|
| 1068 |
+
preset_caption = gr.HTML(
|
| 1069 |
+
_preset_caption_html("Standard"),
|
| 1070 |
+
)
|
| 1071 |
+
with gr.Column(scale=1, min_width=140):
|
| 1072 |
+
run_btn = gr.Button("▶ Run check", variant="primary",
|
| 1073 |
+
elem_classes=["bg-run-btn"])
|
| 1074 |
+
stop_btn = gr.Button("◼ Stop", variant="stop",
|
| 1075 |
+
elem_classes=["bg-run-btn", "bg-stop-btn"],
|
| 1076 |
+
visible=False)
|
| 1077 |
+
gr.HTML('<div class="bg-fname" style="text-align:center"> </div>')
|
| 1078 |
+
|
| 1079 |
+
# Holds the selected file paths (strings). Updated by the UploadButton
|
| 1080 |
+
# callbacks below so run_check sees plain paths regardless of how the
|
| 1081 |
+
# user picked the files.
|
| 1082 |
+
bib_path_state = gr.State(value=None)
|
| 1083 |
+
tex_path_state = gr.State(value=None)
|
| 1084 |
+
|
| 1085 |
+
# Advanced fine-grained toggles. Default closed — most users just
|
| 1086 |
+
# pick a preset and go. Each tab is composed of gr.Row blocks of
|
| 1087 |
+
# exactly 4 cells so columns line up vertically. Short rows are
|
| 1088 |
+
# padded with invisible spacer HTML.
|
| 1089 |
+
def _spacer():
|
| 1090 |
+
return gr.HTML('<div class="bg-row-spacer"> </div>',
|
| 1091 |
+
elem_classes=["bg-row-spacer"])
|
| 1092 |
+
|
| 1093 |
+
with gr.Accordion("⚙️ Advanced settings", open=False):
|
| 1094 |
+
with gr.Tabs():
|
| 1095 |
+
with gr.TabItem("Bibliography"):
|
| 1096 |
+
with gr.Row(elem_classes=["bg-row"]):
|
| 1097 |
+
check_metadata = gr.Checkbox(label="Metadata verify", value=False)
|
| 1098 |
+
check_usage = gr.Checkbox(label="Usage", value=True)
|
| 1099 |
+
check_duplicates = gr.Checkbox(label="Duplicates", value=True)
|
| 1100 |
+
check_preprint_ratio = gr.Checkbox(label="Preprints", value=True)
|
| 1101 |
+
with gr.Row(elem_classes=["bg-row"]):
|
| 1102 |
+
retraction = gr.Checkbox(label="Retractions", value=True)
|
| 1103 |
+
url_liveness = gr.Checkbox(label="URL liveness", value=False)
|
| 1104 |
+
_spacer()
|
| 1105 |
+
_spacer()
|
| 1106 |
+
|
| 1107 |
+
with gr.TabItem("LaTeX format"):
|
| 1108 |
+
with gr.Row(elem_classes=["bg-row"]):
|
| 1109 |
+
caption = gr.Checkbox(label="Captions", value=True)
|
| 1110 |
+
reference = gr.Checkbox(label="References", value=True)
|
| 1111 |
+
formatting = gr.Checkbox(label="Formatting", value=True)
|
| 1112 |
+
equation = gr.Checkbox(label="Equations", value=True)
|
| 1113 |
+
|
| 1114 |
+
with gr.TabItem("Writing"):
|
| 1115 |
+
with gr.Row(elem_classes=["bg-row"]):
|
| 1116 |
+
ai_artifacts = gr.Checkbox(label="AI artifacts", value=True)
|
| 1117 |
+
sentence = gr.Checkbox(label="Sentences", value=True)
|
| 1118 |
+
consistency = gr.Checkbox(label="Consistency", value=True)
|
| 1119 |
+
acronym = gr.Checkbox(label="Acronyms", value=True)
|
| 1120 |
+
with gr.Row(elem_classes=["bg-row"]):
|
| 1121 |
+
number = gr.Checkbox(label="Numbers", value=True)
|
| 1122 |
+
citation_quality = gr.Checkbox(label="Citations", value=True)
|
| 1123 |
+
anonymization = gr.Checkbox(label="Anonymization", value=True)
|
| 1124 |
+
_spacer()
|
| 1125 |
+
|
| 1126 |
+
# ───────────────────────── Status strip ─────────────────────────
|
| 1127 |
+
status_panel = gr.HTML(value=EMPTY_STATUS_HTML, elem_id="bg-status-wrap")
|
| 1128 |
+
|
| 1129 |
+
# ───────────────────────── Report (full width) ───────────────────
|
| 1130 |
+
with gr.Row(elem_classes=["bg-main"]):
|
| 1131 |
+
report_panel = gr.HTML(value=EMPTY_PANEL_HTML)
|
| 1132 |
+
|
| 1133 |
+
# ───────────────────────── Downloads ────────────────────────────
|
| 1134 |
+
with gr.Accordion("📥 Downloads", open=False):
|
| 1135 |
+
with gr.Row(elem_classes=["bg-downloads"]):
|
| 1136 |
+
download_html = gr.File(label="report.html (offline)",
|
| 1137 |
+
interactive=False, elem_classes=["bg-file-input"])
|
| 1138 |
+
download_md = gr.File(label="bibliography_report.md",
|
| 1139 |
+
interactive=False, elem_classes=["bg-file-input"])
|
| 1140 |
+
download_json = gr.File(label="report.json",
|
| 1141 |
+
interactive=False, elem_classes=["bg-file-input"])
|
| 1142 |
+
download_bib = gr.File(label="cleaned .bib",
|
| 1143 |
+
interactive=False, elem_classes=["bg-file-input"])
|
| 1144 |
+
download_log = gr.File(label="bibguard.log",
|
| 1145 |
+
interactive=False, elem_classes=["bg-file-input"])
|
| 1146 |
+
|
| 1147 |
+
gr.HTML(
|
| 1148 |
+
'<div class="bg-footer">'
|
| 1149 |
+
'Set <code>$BIBGUARD_CONTACT_EMAIL</code> for the polite-pool User-Agent · '
|
| 1150 |
+
f'persistent log at <code>{LOG_PATH}</code> · '
|
| 1151 |
+
'set <code>BIBGUARD_DEBUG=1</code> for verbose console output.'
|
| 1152 |
+
'</div>'
|
| 1153 |
+
)
|
| 1154 |
+
|
| 1155 |
+
preset.change(
|
| 1156 |
+
fn=apply_preset,
|
| 1157 |
+
inputs=[preset],
|
| 1158 |
+
outputs=[
|
| 1159 |
+
check_metadata, check_usage, check_duplicates, check_preprint_ratio,
|
| 1160 |
+
caption, reference, formatting, equation,
|
| 1161 |
+
ai_artifacts, sentence, consistency, acronym,
|
| 1162 |
+
number, citation_quality, anonymization,
|
| 1163 |
+
url_liveness, retraction,
|
| 1164 |
+
],
|
| 1165 |
+
)
|
| 1166 |
+
preset.change(
|
| 1167 |
+
fn=_preset_caption_html,
|
| 1168 |
+
inputs=[preset],
|
| 1169 |
+
outputs=[preset_caption],
|
| 1170 |
+
)
|
| 1171 |
+
|
| 1172 |
+
# ---- Upload-button callbacks: store path in state + update chip ----
|
| 1173 |
+
|
| 1174 |
+
def _on_bib_upload(f):
|
| 1175 |
+
if f is None:
|
| 1176 |
+
return None, '<div class="bg-fname">no file selected</div>'
|
| 1177 |
+
path = getattr(f, "name", str(f))
|
| 1178 |
+
return path, f'<div class="bg-fname ok">📚 {Path(path).name}</div>'
|
| 1179 |
+
|
| 1180 |
+
def _on_tex_upload(f):
|
| 1181 |
+
if f is None:
|
| 1182 |
+
return None, '<div class="bg-fname">no file selected</div>'
|
| 1183 |
+
path = getattr(f, "name", str(f))
|
| 1184 |
+
return path, f'<div class="bg-fname ok">📄 {Path(path).name}</div>'
|
| 1185 |
+
|
| 1186 |
+
bib_btn.upload(_on_bib_upload, inputs=[bib_btn], outputs=[bib_path_state, bib_status])
|
| 1187 |
+
tex_btn.upload(_on_tex_upload, inputs=[tex_btn], outputs=[tex_path_state, tex_status])
|
| 1188 |
+
|
| 1189 |
+
# Run pipeline:
|
| 1190 |
+
# 1. Toggle visibility: hide Run, show Stop.
|
| 1191 |
+
# 2. Stream run_check yields into report + status + downloads.
|
| 1192 |
+
# 3. After completion, swap buttons back.
|
| 1193 |
+
# Stop button cancels the streaming task via Gradio's `cancels=`.
|
| 1194 |
+
def _show_stop():
|
| 1195 |
+
return gr.update(visible=False), gr.update(visible=True)
|
| 1196 |
+
|
| 1197 |
+
def _show_run():
|
| 1198 |
+
return gr.update(visible=True), gr.update(visible=False)
|
| 1199 |
+
|
| 1200 |
+
run_event = run_btn.click(
|
| 1201 |
+
fn=_show_stop, inputs=None, outputs=[run_btn, stop_btn],
|
| 1202 |
+
).then(
|
| 1203 |
fn=run_check,
|
| 1204 |
inputs=[
|
| 1205 |
+
bib_path_state, tex_path_state,
|
| 1206 |
check_metadata, check_usage, check_duplicates, check_preprint_ratio,
|
| 1207 |
caption, reference, formatting, equation, ai_artifacts,
|
| 1208 |
+
sentence, consistency, acronym, number, citation_quality, anonymization,
|
| 1209 |
+
url_liveness, retraction,
|
| 1210 |
],
|
| 1211 |
+
outputs=[report_panel, status_panel,
|
| 1212 |
+
download_html, download_md, download_json, download_bib, download_log],
|
| 1213 |
+
).then(
|
| 1214 |
+
fn=_show_run, inputs=None, outputs=[run_btn, stop_btn],
|
| 1215 |
+
)
|
| 1216 |
+
|
| 1217 |
+
stop_btn.click(
|
| 1218 |
+
fn=lambda: (
|
| 1219 |
+
gr.update(visible=True),
|
| 1220 |
+
gr.update(visible=False),
|
| 1221 |
+
_status_html("Cancelled",
|
| 1222 |
+
"Run interrupted by user. Partial results discarded.",
|
| 1223 |
+
state="error"),
|
| 1224 |
+
),
|
| 1225 |
+
inputs=None,
|
| 1226 |
+
outputs=[run_btn, stop_btn, status_panel],
|
| 1227 |
+
cancels=[run_event],
|
| 1228 |
)
|
| 1229 |
+
|
| 1230 |
return app
|
| 1231 |
|
| 1232 |
|
|
|
|
| 1233 |
app = create_app()
|
| 1234 |
|
| 1235 |
+
|
| 1236 |
if __name__ == "__main__":
|
| 1237 |
+
_favicon = Path(__file__).parent / "assets" / "icon-192.png"
|
| 1238 |
app.launch(
|
| 1239 |
+
favicon_path=str(_favicon) if _favicon.exists() else None,
|
| 1240 |
show_error=True,
|
| 1241 |
css=CUSTOM_CSS,
|
| 1242 |
+
theme=gr.themes.Soft(),
|
| 1243 |
)
|
app_helper.py
CHANGED
|
@@ -1,98 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
def fetch_and_compare_with_workflow(
|
| 2 |
-
entry,
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
):
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
#
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
#
|
| 50 |
-
if
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
if
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
#
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
#
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Per-entry metadata verification: parallel multi-source lookup with corroboration.
|
| 3 |
+
|
| 4 |
+
Strategy (in order):
|
| 5 |
+
1. **Identifier lookups, in parallel**:
|
| 6 |
+
- DOI → CrossRef, Semantic Scholar, OpenAlex
|
| 7 |
+
- arXiv ID → arXiv, Semantic Scholar
|
| 8 |
+
If the bib entry has either, this stage usually returns 2-3 independent
|
| 9 |
+
hits within a few hundred ms. Identifier lookups are far more reliable
|
| 10 |
+
than title search because the identifier is unique.
|
| 11 |
+
|
| 12 |
+
2. **Title searches across sources, in parallel** (always run as corroboration,
|
| 13 |
+
even if identifiers were found): Semantic Scholar, OpenAlex, DBLP, CrossRef,
|
| 14 |
+
arXiv. Each source returns top-K candidates; we keep the candidate whose
|
| 15 |
+
title most closely matches the bib title.
|
| 16 |
+
|
| 17 |
+
3. **Score & corroborate**:
|
| 18 |
+
- Pick the result with the highest per-source confidence.
|
| 19 |
+
- If ≥2 sources independently report the same title (sim ≥ 0.95) we
|
| 20 |
+
mark `is_match=True` even when individual confidences are middling
|
| 21 |
+
— multi-source agreement is the single strongest signal.
|
| 22 |
+
- Tightened thresholds: title sim ≥ 0.88 + year diff ≤ 1 (or year empty)
|
| 23 |
+
to declare a single-source match. Single-source matches that disagree
|
| 24 |
+
with corroborating sources are downgraded.
|
| 25 |
+
|
| 26 |
+
The function still returns a single ComparisonResult so the rest of the
|
| 27 |
+
pipeline doesn't change. Extra evidence (sources tried, agreement count) is
|
| 28 |
+
stuffed into the `issues` field as informational notes when relevant.
|
| 29 |
+
"""
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
|
| 32 |
+
import concurrent.futures as cf
|
| 33 |
+
import logging
|
| 34 |
+
from typing import List, Optional, Tuple
|
| 35 |
+
|
| 36 |
+
from src.utils.normalizer import TextNormalizer
|
| 37 |
+
|
| 38 |
+
logger = logging.getLogger(__name__)
|
| 39 |
+
|
| 40 |
+
# Year tolerance for "match" (preprint vs published often differ by 1y).
|
| 41 |
+
_YEAR_TOL = 1
|
| 42 |
+
# Title similarity required for single-source match.
|
| 43 |
+
_TITLE_MATCH_TIGHT = 0.88
|
| 44 |
+
# Title similarity required to count as "corroborating" another source.
|
| 45 |
+
_TITLE_AGREE = 0.95
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _title_sim(a: str, b: str) -> float:
|
| 49 |
+
if not a or not b:
|
| 50 |
+
return 0.0
|
| 51 |
+
a_n = TextNormalizer.normalize_for_comparison(a)
|
| 52 |
+
b_n = TextNormalizer.normalize_for_comparison(b)
|
| 53 |
+
if not a_n or not b_n:
|
| 54 |
+
return 0.0
|
| 55 |
+
jacc = TextNormalizer.similarity_ratio(a_n, b_n)
|
| 56 |
+
if max(len(a_n), len(b_n)) < 200:
|
| 57 |
+
lev = TextNormalizer.levenshtein_similarity(a_n, b_n)
|
| 58 |
+
return max(jacc, lev)
|
| 59 |
+
return jacc
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _year_close(y1: str, y2: str) -> bool:
|
| 63 |
+
"""True if years are missing on either side or within ±1."""
|
| 64 |
+
y1, y2 = (y1 or "").strip(), (y2 or "").strip()
|
| 65 |
+
if not y1 or not y2:
|
| 66 |
+
return True
|
| 67 |
+
try:
|
| 68 |
+
return abs(int(y1[:4]) - int(y2[:4])) <= _YEAR_TOL
|
| 69 |
+
except ValueError:
|
| 70 |
+
return False
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _pick_best_candidate(bib_title: str, candidates: list) -> Tuple[Optional[object], float]:
|
| 74 |
+
"""Pick the candidate whose title most closely matches `bib_title`."""
|
| 75 |
+
best, best_sim = None, 0.0
|
| 76 |
+
for c in candidates:
|
| 77 |
+
sim = _title_sim(bib_title, getattr(c, "title", "") or "")
|
| 78 |
+
if sim > best_sim:
|
| 79 |
+
best, best_sim = c, sim
|
| 80 |
+
return best, best_sim
|
| 81 |
+
|
| 82 |
+
|
| 83 |
def fetch_and_compare_with_workflow(
|
| 84 |
+
entry,
|
| 85 |
+
workflow_steps, # accepted for API compat; ignored — strategy is fixed
|
| 86 |
+
arxiv_fetcher,
|
| 87 |
+
crossref_fetcher,
|
| 88 |
+
semantic_scholar_fetcher,
|
| 89 |
+
openalex_fetcher,
|
| 90 |
+
dblp_fetcher,
|
| 91 |
+
comparator,
|
| 92 |
):
|
| 93 |
+
"""Look up `entry` across all available sources in parallel and return a single ComparisonResult."""
|
| 94 |
+
has_doi = bool(getattr(entry, "doi", "") or "")
|
| 95 |
+
has_arxiv = bool(getattr(entry, "has_arxiv", False))
|
| 96 |
+
has_title = bool(getattr(entry, "title", "") or "")
|
| 97 |
+
|
| 98 |
+
if not (has_doi or has_arxiv or has_title):
|
| 99 |
+
return comparator.create_unable_result(entry, "Entry has no DOI, arXiv ID, or title to look up")
|
| 100 |
+
|
| 101 |
+
# ------------------------------------------------------------------ stage 1
|
| 102 |
+
# Tasks are tuples of (source_name, callable returning ComparisonResult or None).
|
| 103 |
+
tasks: list[tuple[str, callable]] = []
|
| 104 |
+
|
| 105 |
+
# Identifier-based lookups (high precision).
|
| 106 |
+
if has_doi and crossref_fetcher:
|
| 107 |
+
def _t_cr_doi(e=entry):
|
| 108 |
+
r = crossref_fetcher.search_by_doi(e.doi)
|
| 109 |
+
return comparator.compare_with_crossref(e, r) if r else None
|
| 110 |
+
tasks.append(("crossref(doi)", _t_cr_doi))
|
| 111 |
+
|
| 112 |
+
if has_doi and semantic_scholar_fetcher:
|
| 113 |
+
def _t_s2_doi(e=entry):
|
| 114 |
+
r = semantic_scholar_fetcher.fetch_by_doi(e.doi)
|
| 115 |
+
return comparator.compare_with_semantic_scholar(e, r) if r else None
|
| 116 |
+
tasks.append(("s2(doi)", _t_s2_doi))
|
| 117 |
+
|
| 118 |
+
if has_doi and openalex_fetcher:
|
| 119 |
+
def _t_oa_doi(e=entry):
|
| 120 |
+
r = openalex_fetcher.fetch_by_doi(e.doi)
|
| 121 |
+
return comparator.compare_with_openalex(e, r) if r else None
|
| 122 |
+
tasks.append(("openalex(doi)", _t_oa_doi))
|
| 123 |
+
|
| 124 |
+
if has_arxiv and arxiv_fetcher:
|
| 125 |
+
def _t_arxiv_id(e=entry):
|
| 126 |
+
r = arxiv_fetcher.fetch_by_id(e.arxiv_id)
|
| 127 |
+
return comparator.compare_with_arxiv(e, r) if r else None
|
| 128 |
+
tasks.append(("arxiv(id)", _t_arxiv_id))
|
| 129 |
+
|
| 130 |
+
if has_arxiv and semantic_scholar_fetcher and not has_doi:
|
| 131 |
+
# If we already queried S2 by DOI we don't double-bill.
|
| 132 |
+
def _t_s2_arxiv(e=entry):
|
| 133 |
+
r = semantic_scholar_fetcher.fetch_by_arxiv_id(e.arxiv_id)
|
| 134 |
+
return comparator.compare_with_semantic_scholar(e, r) if r else None
|
| 135 |
+
tasks.append(("s2(arxiv)", _t_s2_arxiv))
|
| 136 |
+
|
| 137 |
+
# Title-based lookups (always run as corroboration if title available).
|
| 138 |
+
if has_title:
|
| 139 |
+
if semantic_scholar_fetcher and not has_doi and not has_arxiv:
|
| 140 |
+
def _t_s2_title(e=entry):
|
| 141 |
+
cands = semantic_scholar_fetcher.search_by_title_multi(e.title, max_results=5)
|
| 142 |
+
best, _ = _pick_best_candidate(e.title, cands)
|
| 143 |
+
return comparator.compare_with_semantic_scholar(e, best) if best else None
|
| 144 |
+
tasks.append(("s2(title)", _t_s2_title))
|
| 145 |
+
|
| 146 |
+
if openalex_fetcher and not has_doi:
|
| 147 |
+
def _t_oa_title(e=entry):
|
| 148 |
+
cands = openalex_fetcher.search_by_title_multi(e.title, max_results=5)
|
| 149 |
+
best, _ = _pick_best_candidate(e.title, cands)
|
| 150 |
+
return comparator.compare_with_openalex(e, best) if best else None
|
| 151 |
+
tasks.append(("openalex(title)", _t_oa_title))
|
| 152 |
+
|
| 153 |
+
if dblp_fetcher:
|
| 154 |
+
def _t_dblp_title(e=entry):
|
| 155 |
+
cands = dblp_fetcher.search_by_title_multi(e.title, max_results=5)
|
| 156 |
+
best, _ = _pick_best_candidate(e.title, cands)
|
| 157 |
+
return comparator.compare_with_dblp(e, best) if best else None
|
| 158 |
+
tasks.append(("dblp(title)", _t_dblp_title))
|
| 159 |
+
|
| 160 |
+
if crossref_fetcher and not has_doi:
|
| 161 |
+
def _t_cr_title(e=entry):
|
| 162 |
+
cands = crossref_fetcher.search_by_title_multi(e.title, max_results=5)
|
| 163 |
+
best, _ = _pick_best_candidate(e.title, cands)
|
| 164 |
+
return comparator.compare_with_crossref(e, best) if best else None
|
| 165 |
+
tasks.append(("crossref(title)", _t_cr_title))
|
| 166 |
+
|
| 167 |
+
if arxiv_fetcher and not has_arxiv:
|
| 168 |
+
def _t_arxiv_title(e=entry):
|
| 169 |
+
cands = arxiv_fetcher.search_by_title(e.title, max_results=5)
|
| 170 |
+
best, _ = _pick_best_candidate(e.title, cands)
|
| 171 |
+
return comparator.compare_with_arxiv(e, best) if best else None
|
| 172 |
+
tasks.append(("arxiv(title)", _t_arxiv_title))
|
| 173 |
+
|
| 174 |
+
if not tasks:
|
| 175 |
+
return comparator.create_unable_result(entry, "No fetchers configured")
|
| 176 |
+
|
| 177 |
+
# Run in parallel with EARLY EXIT.
|
| 178 |
+
#
|
| 179 |
+
# Strategy:
|
| 180 |
+
# - Submit every task to a pool.
|
| 181 |
+
# - Drain `as_completed` with a SHORT poll deadline.
|
| 182 |
+
# - Stop early as soon as we have one high-confidence match (≥0.85)
|
| 183 |
+
# plus at least one corroborating result whose title aligns.
|
| 184 |
+
# - Hard ceiling: 18s total wall-clock per entry. Whatever finished
|
| 185 |
+
# by then is what we use; the rest is cancelled so we don't pay
|
| 186 |
+
# the slowest-source penalty (a 80s-rate-limited S2 retry, e.g.).
|
| 187 |
+
results: list = []
|
| 188 |
+
sources_tried: list[str] = []
|
| 189 |
+
entry_key = getattr(entry, "key", "<unknown>")
|
| 190 |
+
deadline = __import__("time").monotonic() + 18.0
|
| 191 |
+
HIGH_CONF = 0.85
|
| 192 |
+
|
| 193 |
+
def _have_corroborated(rs: list) -> bool:
|
| 194 |
+
if not rs:
|
| 195 |
+
return False
|
| 196 |
+
rs_sorted = sorted(rs, key=lambda r: r.confidence, reverse=True)
|
| 197 |
+
primary = rs_sorted[0]
|
| 198 |
+
if primary.confidence < HIGH_CONF:
|
| 199 |
+
return False
|
| 200 |
+
for other in rs_sorted[1:]:
|
| 201 |
+
if other.fetched_title and _title_sim(primary.fetched_title,
|
| 202 |
+
other.fetched_title) >= _TITLE_AGREE:
|
| 203 |
+
return True
|
| 204 |
+
return False
|
| 205 |
+
|
| 206 |
+
pool = cf.ThreadPoolExecutor(max_workers=min(8, len(tasks)))
|
| 207 |
+
future_to_name = {pool.submit(fn): name for name, fn in tasks}
|
| 208 |
+
try:
|
| 209 |
+
pending = set(future_to_name)
|
| 210 |
+
while pending:
|
| 211 |
+
remaining = deadline - __import__("time").monotonic()
|
| 212 |
+
if remaining <= 0:
|
| 213 |
+
logger.debug("Entry=%s: 18s deadline reached, %d sources still pending",
|
| 214 |
+
entry_key, len(pending))
|
| 215 |
+
break
|
| 216 |
+
done, pending = cf.wait(pending, timeout=min(remaining, 2.0),
|
| 217 |
+
return_when=cf.FIRST_COMPLETED)
|
| 218 |
+
for fut in done:
|
| 219 |
+
name = future_to_name[fut]
|
| 220 |
+
sources_tried.append(name)
|
| 221 |
+
try:
|
| 222 |
+
r = fut.result(timeout=0)
|
| 223 |
+
except Exception as e:
|
| 224 |
+
logger.warning(
|
| 225 |
+
"Lookup failed for entry=%s source=%s: %s",
|
| 226 |
+
entry_key, name, e, exc_info=True,
|
| 227 |
+
)
|
| 228 |
+
continue
|
| 229 |
+
if r is not None:
|
| 230 |
+
results.append(r)
|
| 231 |
+
if _have_corroborated(results):
|
| 232 |
+
logger.debug("Entry=%s: corroborated early after %d sources", entry_key, len(results))
|
| 233 |
+
break
|
| 234 |
+
finally:
|
| 235 |
+
# Cancel anything still in the queue; threads already running can't
|
| 236 |
+
# be killed, but they'll finish quietly without blocking us.
|
| 237 |
+
for fut in future_to_name:
|
| 238 |
+
if not fut.done():
|
| 239 |
+
fut.cancel()
|
| 240 |
+
pool.shutdown(wait=False, cancel_futures=True)
|
| 241 |
+
|
| 242 |
+
if not results:
|
| 243 |
+
return comparator.create_unable_result(
|
| 244 |
+
entry,
|
| 245 |
+
f"Tried {len(tasks)} sources ({', '.join(sources_tried) or 'none'}) — no metadata returned"
|
| 246 |
+
)
|
| 247 |
+
|
| 248 |
+
# ------------------------------------------------------------------ stage 2: pick + corroborate
|
| 249 |
+
# Sort by confidence; pick top.
|
| 250 |
+
results.sort(key=lambda r: r.confidence, reverse=True)
|
| 251 |
+
primary = results[0]
|
| 252 |
+
|
| 253 |
+
# Count corroborating sources that report a title within sim ≥ _TITLE_AGREE
|
| 254 |
+
# of the primary's fetched_title.
|
| 255 |
+
primary_title = primary.fetched_title
|
| 256 |
+
agree_count = 0
|
| 257 |
+
distinct_sources = set()
|
| 258 |
+
for r in results:
|
| 259 |
+
if r is primary:
|
| 260 |
+
continue
|
| 261 |
+
if not r.fetched_title:
|
| 262 |
+
continue
|
| 263 |
+
if _title_sim(primary_title, r.fetched_title) >= _TITLE_AGREE:
|
| 264 |
+
agree_count += 1
|
| 265 |
+
distinct_sources.add(r.source)
|
| 266 |
+
|
| 267 |
+
# ------------------------------------------------------------------ stage 3: refine match decision
|
| 268 |
+
# Tighten / loosen `is_match` based on corroboration + year tolerance.
|
| 269 |
+
title_ok_tight = primary.title_similarity >= _TITLE_MATCH_TIGHT
|
| 270 |
+
year_ok_loose = _year_close(primary.bib_year, primary.fetched_year)
|
| 271 |
+
|
| 272 |
+
if agree_count >= 1 and title_ok_tight:
|
| 273 |
+
primary.is_match = True
|
| 274 |
+
elif title_ok_tight and primary.author_match and year_ok_loose:
|
| 275 |
+
primary.is_match = True
|
| 276 |
+
elif primary.is_match and not (title_ok_tight and year_ok_loose):
|
| 277 |
+
# Original heuristic said match but our stricter rule disagrees.
|
| 278 |
+
primary.is_match = False
|
| 279 |
+
if not any("stricter check" in i.lower() for i in primary.issues):
|
| 280 |
+
primary.issues.append(
|
| 281 |
+
"Marked unverified by stricter check (title/year tolerance not met)."
|
| 282 |
+
)
|
| 283 |
+
|
| 284 |
+
# Boost / annotate confidence with corroboration signal.
|
| 285 |
+
if agree_count >= 1:
|
| 286 |
+
# Each corroborating source bumps confidence toward 1.0.
|
| 287 |
+
bonus = min(0.25, 0.1 + 0.05 * agree_count)
|
| 288 |
+
primary.confidence = min(1.0, primary.confidence + bonus)
|
| 289 |
+
# Positive note — goes to `notes`, NOT `issues`. Otherwise verified
|
| 290 |
+
# entries would display a misleading "1 issue(s)" badge.
|
| 291 |
+
primary.notes.append(
|
| 292 |
+
f"Corroborated by {agree_count} other source(s): {', '.join(sorted(distinct_sources))}."
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
# Year-only mismatch with otherwise solid match: drop the hard issue
|
| 296 |
+
# and record a soft note instead (preprint/published year difference).
|
| 297 |
+
if (primary.title_match and primary.author_match and not primary.year_match
|
| 298 |
+
and year_ok_loose and primary.bib_year and primary.fetched_year):
|
| 299 |
+
primary.issues = [
|
| 300 |
+
i for i in primary.issues if not i.startswith("Year mismatch")
|
| 301 |
+
]
|
| 302 |
+
primary.notes.append(
|
| 303 |
+
f"Year differs by ≤1 ({primary.bib_year} vs {primary.fetched_year}) — "
|
| 304 |
+
"likely preprint/published difference, treated as match."
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
return primary
|
bibguard.yaml
CHANGED
|
@@ -27,6 +27,23 @@ files:
|
|
| 27 |
output_dir: "test"
|
| 28 |
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# ==============================================================================
|
| 31 |
# 🎓 Conference Template
|
| 32 |
# ==============================================================================
|
|
@@ -59,7 +76,7 @@ bibliography:
|
|
| 59 |
|
| 60 |
# Relevance Assessment - Use LLM to evaluate if citations match their context
|
| 61 |
# Requires LLM configuration (see llm section below). Disabled by default due to API costs.
|
| 62 |
-
check_relevance:
|
| 63 |
|
| 64 |
# ==============================================================================
|
| 65 |
# 📋 Submission Quality Checks
|
|
@@ -125,6 +142,21 @@ submission:
|
|
| 125 |
# Detects GitHub links, acknowledgments, self-citations that may reveal author identity
|
| 126 |
anonymization: true
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
# ==============================================================================
|
| 129 |
# 🔍 Metadata Check Workflow
|
| 130 |
# ==============================================================================
|
|
@@ -133,7 +165,7 @@ submission:
|
|
| 133 |
# Set enabled: false to skip a particular source.
|
| 134 |
workflow:
|
| 135 |
- name: arxiv_id
|
| 136 |
-
enabled:
|
| 137 |
description: "Lookup by arXiv ID (fastest, most reliable for preprints)"
|
| 138 |
|
| 139 |
- name: crossref_doi
|
|
@@ -153,7 +185,7 @@ workflow:
|
|
| 153 |
description: "OpenAlex API (broad coverage across disciplines)"
|
| 154 |
|
| 155 |
- name: arxiv_title
|
| 156 |
-
enabled:
|
| 157 |
description: "Search arXiv by title (fallback when ID unavailable)"
|
| 158 |
|
| 159 |
- name: crossref_title
|
|
@@ -171,17 +203,18 @@ llm:
|
|
| 171 |
# Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek
|
| 172 |
# Each backend requires different setup (API keys, local installation, etc.)
|
| 173 |
backend: "gemini"
|
| 174 |
-
|
| 175 |
# Model name (leave empty to use backend default)
|
| 176 |
-
# Examples: "gpt-
|
| 177 |
model: ""
|
| 178 |
|
| 179 |
# API endpoint (leave empty to use backend default)
|
| 180 |
# Only needed for self-hosted models (vllm, ollama) or custom endpoints
|
| 181 |
endpoint: ""
|
| 182 |
|
| 183 |
-
# API key (
|
| 184 |
-
# Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.
|
|
|
|
| 185 |
api_key: ""
|
| 186 |
|
| 187 |
# ==============================================================================
|
|
|
|
| 27 |
output_dir: "test"
|
| 28 |
|
| 29 |
|
| 30 |
+
# ==============================================================================
|
| 31 |
+
# 🌐 Network / Politeness
|
| 32 |
+
# ==============================================================================
|
| 33 |
+
network:
|
| 34 |
+
# Real email used in User-Agent for arXiv/CrossRef/OpenAlex polite-pool requests.
|
| 35 |
+
# arXiv's robots policy asks for a real contact. Strongly recommended to fill in.
|
| 36 |
+
contact_email: ""
|
| 37 |
+
|
| 38 |
+
# Cache HTTP responses to a local SQLite DB. Same `entry.key` won't re-hit network
|
| 39 |
+
# within the TTL window. Hugely speeds up re-runs.
|
| 40 |
+
cache_enabled: true
|
| 41 |
+
cache_ttl_hours: 24
|
| 42 |
+
|
| 43 |
+
# Auto-retry on 429/5xx with exponential backoff.
|
| 44 |
+
retry_total: 5
|
| 45 |
+
retry_backoff_factor: 1.5
|
| 46 |
+
|
| 47 |
# ==============================================================================
|
| 48 |
# 🎓 Conference Template
|
| 49 |
# ==============================================================================
|
|
|
|
| 76 |
|
| 77 |
# Relevance Assessment - Use LLM to evaluate if citations match their context
|
| 78 |
# Requires LLM configuration (see llm section below). Disabled by default due to API costs.
|
| 79 |
+
check_relevance: true
|
| 80 |
|
| 81 |
# ==============================================================================
|
| 82 |
# 📋 Submission Quality Checks
|
|
|
|
| 142 |
# Detects GitHub links, acknowledgments, self-citations that may reveal author identity
|
| 143 |
anonymization: true
|
| 144 |
|
| 145 |
+
# ==============================================================================
|
| 146 |
+
# 🌐 Network-Bound Bibliography Checks
|
| 147 |
+
# ==============================================================================
|
| 148 |
+
# These run only when explicitly enabled. Both operate solely on bib entries
|
| 149 |
+
# that carry the relevant field (no DOI ⇒ retraction skipped, no url= ⇒
|
| 150 |
+
# liveness skipped). The web UI's "Strict" preset turns both on.
|
| 151 |
+
submission_extra:
|
| 152 |
+
# URL Liveness - HEAD-then-GET every entry.url to find dead links.
|
| 153 |
+
# Slow on large bibs (one HTTP roundtrip per URL); off by default.
|
| 154 |
+
url_liveness: false
|
| 155 |
+
|
| 156 |
+
# Retractions - Look up every entry.doi against CrossRef's update-to relation
|
| 157 |
+
# to flag retracted, withdrawn, or "expression of concern" papers.
|
| 158 |
+
retraction: true
|
| 159 |
+
|
| 160 |
# ==============================================================================
|
| 161 |
# 🔍 Metadata Check Workflow
|
| 162 |
# ==============================================================================
|
|
|
|
| 165 |
# Set enabled: false to skip a particular source.
|
| 166 |
workflow:
|
| 167 |
- name: arxiv_id
|
| 168 |
+
enabled: false
|
| 169 |
description: "Lookup by arXiv ID (fastest, most reliable for preprints)"
|
| 170 |
|
| 171 |
- name: crossref_doi
|
|
|
|
| 185 |
description: "OpenAlex API (broad coverage across disciplines)"
|
| 186 |
|
| 187 |
- name: arxiv_title
|
| 188 |
+
enabled: false
|
| 189 |
description: "Search arXiv by title (fallback when ID unavailable)"
|
| 190 |
|
| 191 |
- name: crossref_title
|
|
|
|
| 203 |
# Backend provider: ollama, vllm, gemini, openai, anthropic, deepseek
|
| 204 |
# Each backend requires different setup (API keys, local installation, etc.)
|
| 205 |
backend: "gemini"
|
| 206 |
+
|
| 207 |
# Model name (leave empty to use backend default)
|
| 208 |
+
# Examples: "gpt-4o-mini", "claude-haiku-4-5-20251001", "gemini-2.5-flash", "llama3"
|
| 209 |
model: ""
|
| 210 |
|
| 211 |
# API endpoint (leave empty to use backend default)
|
| 212 |
# Only needed for self-hosted models (vllm, ollama) or custom endpoints
|
| 213 |
endpoint: ""
|
| 214 |
|
| 215 |
+
# API key (RECOMMENDED: leave empty and use environment variables instead)
|
| 216 |
+
# Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, DEEPSEEK_API_KEY, etc.
|
| 217 |
+
# in your shell. BibGuard will read from $<BACKEND>_API_KEY automatically.
|
| 218 |
api_key: ""
|
| 219 |
|
| 220 |
# ==============================================================================
|
main.py
CHANGED
|
@@ -7,8 +7,12 @@ Usage:
|
|
| 7 |
python main.py --config my.yaml # Use specified config file
|
| 8 |
python main.py --init # Create default config file
|
| 9 |
python main.py --list-templates # List available templates
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
import argparse
|
|
|
|
| 12 |
import sys
|
| 13 |
from pathlib import Path
|
| 14 |
from typing import Optional, List
|
|
@@ -19,10 +23,17 @@ from src.analyzers import MetadataComparator, UsageChecker, LLMEvaluator, Duplic
|
|
| 19 |
from src.analyzers.llm_evaluator import LLMBackend
|
| 20 |
from src.report.generator import ReportGenerator, EntryReport
|
| 21 |
from src.utils.progress import ProgressDisplay
|
|
|
|
|
|
|
|
|
|
| 22 |
from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config
|
| 23 |
from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
|
| 24 |
from src.templates.base_template import get_template, get_all_templates
|
| 25 |
from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
def main():
|
|
@@ -52,8 +63,24 @@ Usage Examples:
|
|
| 52 |
action="store_true",
|
| 53 |
help="List all available conference templates"
|
| 54 |
)
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
args = parser.parse_args()
|
|
|
|
| 57 |
|
| 58 |
# Handle --init
|
| 59 |
if args.init:
|
|
@@ -95,25 +122,43 @@ Usage Examples:
|
|
| 95 |
print(f"Error: Failed to parse config file: {e}")
|
| 96 |
sys.exit(1)
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
# Validate required fields
|
| 99 |
mode_dir = bool(config.files.input_dir)
|
| 100 |
-
|
| 101 |
if mode_dir:
|
| 102 |
input_dir = config.input_dir_path
|
| 103 |
if not input_dir.exists() or not input_dir.is_dir():
|
| 104 |
print(f"Error: Input directory does not exist or is not a directory: {input_dir}")
|
| 105 |
sys.exit(1)
|
| 106 |
-
|
| 107 |
tex_files = list(input_dir.rglob("*.tex"))
|
| 108 |
bib_files = list(input_dir.rglob("*.bib"))
|
| 109 |
-
|
| 110 |
if not tex_files:
|
| 111 |
print(f"Error: No .tex files found in {input_dir}")
|
| 112 |
sys.exit(1)
|
| 113 |
if not bib_files:
|
| 114 |
print(f"Error: No .bib files found in {input_dir}")
|
| 115 |
sys.exit(1)
|
| 116 |
-
|
| 117 |
config._tex_files = tex_files
|
| 118 |
config._bib_files = bib_files
|
| 119 |
else:
|
|
@@ -123,7 +168,7 @@ Usage Examples:
|
|
| 123 |
if not config.files.tex:
|
| 124 |
print("Error: tex file path not specified in config")
|
| 125 |
sys.exit(1)
|
| 126 |
-
|
| 127 |
# Validate files exist
|
| 128 |
if not config.bib_path.exists():
|
| 129 |
print(f"Error: Bib file does not exist: {config.bib_path}")
|
|
@@ -131,10 +176,29 @@ Usage Examples:
|
|
| 131 |
if not config.tex_path.exists():
|
| 132 |
print(f"Error: TeX file does not exist: {config.tex_path}")
|
| 133 |
sys.exit(1)
|
| 134 |
-
|
| 135 |
config._tex_files = [config.tex_path]
|
| 136 |
config._bib_files = [config.bib_path]
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
# Load template if specified
|
| 139 |
template = None
|
| 140 |
if config.template:
|
|
@@ -143,12 +207,12 @@ Usage Examples:
|
|
| 143 |
print(f"Error: Unknown template: {config.template}")
|
| 144 |
print("Use --list-templates to see available templates")
|
| 145 |
sys.exit(1)
|
| 146 |
-
|
| 147 |
# Run the checker
|
| 148 |
try:
|
| 149 |
run_checker(config, template)
|
| 150 |
except KeyboardInterrupt:
|
| 151 |
-
print("\n\
|
| 152 |
sys.exit(130)
|
| 153 |
except Exception as e:
|
| 154 |
print(f"\nError: {e}")
|
|
@@ -250,32 +314,62 @@ def run_checker(config: BibGuardConfig, template=None):
|
|
| 250 |
[str(f) for f in config._tex_files]
|
| 251 |
)
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
# Run submission quality checks
|
| 254 |
submission_results = []
|
| 255 |
-
enabled_checkers = config.submission.get_enabled_checkers()
|
| 256 |
-
|
|
|
|
|
|
|
| 257 |
for checker_name in enabled_checkers:
|
| 258 |
if checker_name in CHECKER_REGISTRY:
|
| 259 |
checker = CHECKER_REGISTRY[checker_name]()
|
| 260 |
for tex_path_str, content in tex_contents.items():
|
| 261 |
-
|
| 262 |
-
#
|
| 263 |
-
|
| 264 |
-
|
| 265 |
submission_results.extend(results)
|
| 266 |
-
|
| 267 |
# Set results in report generator for summary calculation
|
| 268 |
report_gen.set_submission_results(submission_results, template)
|
| 269 |
-
|
| 270 |
# Check for duplicates (silent)
|
| 271 |
if bib_config.check_duplicates and duplicate_detector:
|
| 272 |
duplicate_groups = duplicate_detector.find_duplicates(entries)
|
| 273 |
report_gen.set_duplicate_groups(duplicate_groups)
|
| 274 |
-
|
| 275 |
# Check missing citations (silent)
|
| 276 |
if bib_config.check_usage and usage_checker:
|
| 277 |
missing = usage_checker.get_missing_entries(entries)
|
| 278 |
report_gen.set_missing_citations(missing)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
# Process entries
|
| 281 |
|
|
@@ -347,41 +441,46 @@ def run_checker(config: BibGuardConfig, template=None):
|
|
| 347 |
# Determine number of workers (max 10 to avoid overwhelming APIs)
|
| 348 |
max_workers = min(10, len(entries))
|
| 349 |
|
|
|
|
| 350 |
with progress.progress_context(len(entries), "Processing bibliography") as prog:
|
| 351 |
# Use ThreadPoolExecutor for parallel processing
|
| 352 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 353 |
# Submit all tasks
|
| 354 |
future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries}
|
| 355 |
-
|
| 356 |
# Process completed tasks
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
prog.mark_error()
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
completed_count[0] += 1
|
| 382 |
-
prog.update(entry.key, "Failed", 1)
|
| 383 |
-
|
| 384 |
-
# Summary will be printed at the very end
|
| 385 |
|
| 386 |
# Generate reports and organize outputs (silent)
|
| 387 |
|
|
@@ -395,61 +494,55 @@ def run_checker(config: BibGuardConfig, template=None):
|
|
| 395 |
shutil.copy2(bib_path, output_dir / bib_path.name)
|
| 396 |
for tex_path in config._tex_files:
|
| 397 |
shutil.copy2(tex_path, output_dir / tex_path.name)
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
f.write("\n\n".join(all_line_reports))
|
| 430 |
-
|
| 431 |
-
# 4. Clean bib file (if generated earlier)
|
| 432 |
if bib_config.check_usage and usage_checker:
|
| 433 |
used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used]
|
| 434 |
if used_entries:
|
| 435 |
try:
|
| 436 |
keys_to_keep = {entry.key for entry in used_entries}
|
| 437 |
-
# If multiple bibs, we merge them into one cleaned file
|
| 438 |
-
# or just use the first one if it's single mode.
|
| 439 |
-
# For now, let's just use a default name if multiple.
|
| 440 |
if len(config._bib_files) == 1:
|
| 441 |
clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib"
|
| 442 |
bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep)
|
| 443 |
else:
|
| 444 |
clean_bib_path = output_dir / "merged_only_used.bib"
|
| 445 |
-
# We need a way to filter multiple files into one.
|
| 446 |
-
# BibParser.filter_file currently takes one input.
|
| 447 |
-
# Let's just write all used entries to a new file.
|
| 448 |
with open(clean_bib_path, 'w', encoding='utf-8') as f:
|
| 449 |
for entry in used_entries:
|
| 450 |
-
f.write(entry
|
| 451 |
except Exception as e:
|
| 452 |
-
|
|
|
|
|
|
|
|
|
|
| 453 |
|
| 454 |
# Print beautiful console summary
|
| 455 |
if not config.output.quiet:
|
|
@@ -461,85 +554,40 @@ def fetch_and_compare_with_workflow(
|
|
| 461 |
entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher,
|
| 462 |
semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
|
| 463 |
):
|
| 464 |
-
"""
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
ss_result = semantic_scholar_fetcher.fetch_by_doi(entry.doi)
|
| 487 |
-
if not ss_result:
|
| 488 |
-
ss_result = semantic_scholar_fetcher.search_by_title(entry.title)
|
| 489 |
-
if ss_result:
|
| 490 |
-
result = comparator.compare_with_semantic_scholar(entry, ss_result)
|
| 491 |
-
|
| 492 |
-
elif step.name == "dblp" and entry.title and dblp_fetcher:
|
| 493 |
-
dblp_result = dblp_fetcher.search_by_title(entry.title)
|
| 494 |
-
if dblp_result:
|
| 495 |
-
result = comparator.compare_with_dblp(entry, dblp_result)
|
| 496 |
-
|
| 497 |
-
elif step.name == "openalex" and entry.title and openalex_fetcher:
|
| 498 |
-
oa_result = None
|
| 499 |
-
if entry.doi:
|
| 500 |
-
oa_result = openalex_fetcher.fetch_by_doi(entry.doi)
|
| 501 |
-
if not oa_result:
|
| 502 |
-
oa_result = openalex_fetcher.search_by_title(entry.title)
|
| 503 |
-
if oa_result:
|
| 504 |
-
result = comparator.compare_with_openalex(entry, oa_result)
|
| 505 |
-
|
| 506 |
-
elif step.name == "arxiv_title" and entry.title and arxiv_fetcher:
|
| 507 |
-
results = arxiv_fetcher.search_by_title(entry.title, max_results=3)
|
| 508 |
-
if results:
|
| 509 |
-
best_result = None
|
| 510 |
-
best_sim = 0.0
|
| 511 |
-
norm1 = TextNormalizer.normalize_for_comparison(entry.title)
|
| 512 |
-
|
| 513 |
-
for r in results:
|
| 514 |
-
norm2 = TextNormalizer.normalize_for_comparison(r.title)
|
| 515 |
-
sim = TextNormalizer.similarity_ratio(norm1, norm2)
|
| 516 |
-
if sim > best_sim:
|
| 517 |
-
best_sim = sim
|
| 518 |
-
best_result = r
|
| 519 |
-
|
| 520 |
-
if best_result and best_sim > 0.5:
|
| 521 |
-
result = comparator.compare_with_arxiv(entry, best_result)
|
| 522 |
-
|
| 523 |
-
elif step.name == "crossref_title" and entry.title and crossref_fetcher:
|
| 524 |
-
crossref_result = crossref_fetcher.search_by_title(entry.title)
|
| 525 |
-
if crossref_result:
|
| 526 |
-
result = comparator.compare_with_crossref(entry, crossref_result)
|
| 527 |
-
|
| 528 |
-
elif step.name == "google_scholar" and entry.title and scholar_fetcher:
|
| 529 |
scholar_result = scholar_fetcher.search_by_title(entry.title)
|
| 530 |
if scholar_result:
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
return comparator.create_unable_result(entry, "Unable to find this paper in any data source")
|
| 543 |
|
| 544 |
|
| 545 |
def get_abstract(entry, comparison_result, arxiv_fetcher):
|
|
|
|
| 7 |
python main.py --config my.yaml # Use specified config file
|
| 8 |
python main.py --init # Create default config file
|
| 9 |
python main.py --list-templates # List available templates
|
| 10 |
+
python main.py --quick # Skip network-bound metadata/relevance/url checks
|
| 11 |
+
python main.py --format json,html,markdown
|
| 12 |
+
python main.py --verbose # DEBUG-level logs to stderr
|
| 13 |
"""
|
| 14 |
import argparse
|
| 15 |
+
import logging
|
| 16 |
import sys
|
| 17 |
from pathlib import Path
|
| 18 |
from typing import Optional, List
|
|
|
|
| 23 |
from src.analyzers.llm_evaluator import LLMBackend
|
| 24 |
from src.report.generator import ReportGenerator, EntryReport
|
| 25 |
from src.utils.progress import ProgressDisplay
|
| 26 |
+
from src.utils.logging_setup import setup as setup_logging
|
| 27 |
+
from src.utils import http as http_layer
|
| 28 |
+
from src.utils.validation import validate_bib, validate_tex, format_report
|
| 29 |
from src.config.yaml_config import BibGuardConfig, load_config, find_config_file, create_default_config
|
| 30 |
from src.config.workflow import WorkflowConfig, WorkflowStep as WFStep, get_default_workflow
|
| 31 |
from src.templates.base_template import get_template, get_all_templates
|
| 32 |
from src.checkers import CHECKER_REGISTRY, CheckResult, CheckSeverity
|
| 33 |
+
from src.checkers.retraction_checker import RetractionChecker
|
| 34 |
+
from src.checkers.url_checker import URLChecker
|
| 35 |
+
|
| 36 |
+
logger = logging.getLogger("bibguard")
|
| 37 |
|
| 38 |
|
| 39 |
def main():
|
|
|
|
| 63 |
action="store_true",
|
| 64 |
help="List all available conference templates"
|
| 65 |
)
|
| 66 |
+
parser.add_argument(
|
| 67 |
+
"--quick",
|
| 68 |
+
action="store_true",
|
| 69 |
+
help="Skip network-bound checks (metadata, retraction, URL liveness, LLM)",
|
| 70 |
+
)
|
| 71 |
+
parser.add_argument(
|
| 72 |
+
"--format",
|
| 73 |
+
default=None,
|
| 74 |
+
help="Comma-separated list of output formats (markdown, html, json). Defaults to config.",
|
| 75 |
+
)
|
| 76 |
+
parser.add_argument(
|
| 77 |
+
"--verbose", "-v",
|
| 78 |
+
action="store_true",
|
| 79 |
+
help="Verbose (DEBUG) logging to stderr",
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
args = parser.parse_args()
|
| 83 |
+
setup_logging("DEBUG" if args.verbose else None)
|
| 84 |
|
| 85 |
# Handle --init
|
| 86 |
if args.init:
|
|
|
|
| 122 |
print(f"Error: Failed to parse config file: {e}")
|
| 123 |
sys.exit(1)
|
| 124 |
|
| 125 |
+
# CLI overrides
|
| 126 |
+
if args.quick:
|
| 127 |
+
config.bibliography.check_metadata = False
|
| 128 |
+
config.bibliography.check_relevance = False
|
| 129 |
+
config.submission_extra.url_liveness = False
|
| 130 |
+
config.submission_extra.retraction = False
|
| 131 |
+
if args.format:
|
| 132 |
+
config.output.formats = [s.strip() for s in args.format.split(",") if s.strip()]
|
| 133 |
+
|
| 134 |
+
# Configure shared HTTP layer (retry + cache + UA)
|
| 135 |
+
http_layer.configure(
|
| 136 |
+
contact_email=config.network.contact_email,
|
| 137 |
+
cache_enabled=config.network.cache_enabled,
|
| 138 |
+
cache_ttl_hours=config.network.cache_ttl_hours,
|
| 139 |
+
retry_total=config.network.retry_total,
|
| 140 |
+
retry_backoff_factor=config.network.retry_backoff_factor,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
# Validate required fields
|
| 144 |
mode_dir = bool(config.files.input_dir)
|
| 145 |
+
|
| 146 |
if mode_dir:
|
| 147 |
input_dir = config.input_dir_path
|
| 148 |
if not input_dir.exists() or not input_dir.is_dir():
|
| 149 |
print(f"Error: Input directory does not exist or is not a directory: {input_dir}")
|
| 150 |
sys.exit(1)
|
| 151 |
+
|
| 152 |
tex_files = list(input_dir.rglob("*.tex"))
|
| 153 |
bib_files = list(input_dir.rglob("*.bib"))
|
| 154 |
+
|
| 155 |
if not tex_files:
|
| 156 |
print(f"Error: No .tex files found in {input_dir}")
|
| 157 |
sys.exit(1)
|
| 158 |
if not bib_files:
|
| 159 |
print(f"Error: No .bib files found in {input_dir}")
|
| 160 |
sys.exit(1)
|
| 161 |
+
|
| 162 |
config._tex_files = tex_files
|
| 163 |
config._bib_files = bib_files
|
| 164 |
else:
|
|
|
|
| 168 |
if not config.files.tex:
|
| 169 |
print("Error: tex file path not specified in config")
|
| 170 |
sys.exit(1)
|
| 171 |
+
|
| 172 |
# Validate files exist
|
| 173 |
if not config.bib_path.exists():
|
| 174 |
print(f"Error: Bib file does not exist: {config.bib_path}")
|
|
|
|
| 176 |
if not config.tex_path.exists():
|
| 177 |
print(f"Error: TeX file does not exist: {config.tex_path}")
|
| 178 |
sys.exit(1)
|
| 179 |
+
|
| 180 |
config._tex_files = [config.tex_path]
|
| 181 |
config._bib_files = [config.bib_path]
|
| 182 |
+
|
| 183 |
+
# Pre-flight content validation (R6)
|
| 184 |
+
any_fatal = False
|
| 185 |
+
for bp in config._bib_files:
|
| 186 |
+
rep = validate_bib(bp)
|
| 187 |
+
msg = format_report(rep, label=bp.name)
|
| 188 |
+
if msg:
|
| 189 |
+
print(msg)
|
| 190 |
+
if not rep.ok:
|
| 191 |
+
any_fatal = True
|
| 192 |
+
for tp in config._tex_files:
|
| 193 |
+
rep = validate_tex(tp)
|
| 194 |
+
msg = format_report(rep, label=tp.name)
|
| 195 |
+
if msg:
|
| 196 |
+
print(msg)
|
| 197 |
+
if not rep.ok:
|
| 198 |
+
any_fatal = True
|
| 199 |
+
if any_fatal:
|
| 200 |
+
sys.exit(1)
|
| 201 |
+
|
| 202 |
# Load template if specified
|
| 203 |
template = None
|
| 204 |
if config.template:
|
|
|
|
| 207 |
print(f"Error: Unknown template: {config.template}")
|
| 208 |
print("Use --list-templates to see available templates")
|
| 209 |
sys.exit(1)
|
| 210 |
+
|
| 211 |
# Run the checker
|
| 212 |
try:
|
| 213 |
run_checker(config, template)
|
| 214 |
except KeyboardInterrupt:
|
| 215 |
+
print("\n\n[BibGuard] Interrupted. Partial reports (if any) are in the output dir.")
|
| 216 |
sys.exit(130)
|
| 217 |
except Exception as e:
|
| 218 |
print(f"\nError: {e}")
|
|
|
|
| 314 |
[str(f) for f in config._tex_files]
|
| 315 |
)
|
| 316 |
|
| 317 |
+
# Build the per-checker config dict (glossary, template, etc.)
|
| 318 |
+
checker_config = {
|
| 319 |
+
"glossary_preferred": config.glossary.preferred,
|
| 320 |
+
"glossary_acronyms": config.glossary.acronyms,
|
| 321 |
+
"template": template,
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
# Run submission quality checks
|
| 325 |
submission_results = []
|
| 326 |
+
enabled_checkers = list(config.submission.get_enabled_checkers())
|
| 327 |
+
if template is not None and "template" not in enabled_checkers:
|
| 328 |
+
enabled_checkers.append("template")
|
| 329 |
+
|
| 330 |
for checker_name in enabled_checkers:
|
| 331 |
if checker_name in CHECKER_REGISTRY:
|
| 332 |
checker = CHECKER_REGISTRY[checker_name]()
|
| 333 |
for tex_path_str, content in tex_contents.items():
|
| 334 |
+
# Run the checker on this file. We deliberately do NOT tag
|
| 335 |
+
# `r.file_path = tex_path_str` because user-facing reports
|
| 336 |
+
# never expose local tex paths (basename or full).
|
| 337 |
+
results = checker.check(content, checker_config)
|
| 338 |
submission_results.extend(results)
|
| 339 |
+
|
| 340 |
# Set results in report generator for summary calculation
|
| 341 |
report_gen.set_submission_results(submission_results, template)
|
| 342 |
+
|
| 343 |
# Check for duplicates (silent)
|
| 344 |
if bib_config.check_duplicates and duplicate_detector:
|
| 345 |
duplicate_groups = duplicate_detector.find_duplicates(entries)
|
| 346 |
report_gen.set_duplicate_groups(duplicate_groups)
|
| 347 |
+
|
| 348 |
# Check missing citations (silent)
|
| 349 |
if bib_config.check_usage and usage_checker:
|
| 350 |
missing = usage_checker.get_missing_entries(entries)
|
| 351 |
report_gen.set_missing_citations(missing)
|
| 352 |
+
|
| 353 |
+
# Retraction lookups (F1)
|
| 354 |
+
if config.submission_extra.retraction:
|
| 355 |
+
try:
|
| 356 |
+
findings = RetractionChecker().check_entries(entries)
|
| 357 |
+
report_gen.set_retraction_findings(findings)
|
| 358 |
+
if findings:
|
| 359 |
+
logger.info("Retraction check found %d flagged entries", len(findings))
|
| 360 |
+
except Exception as e:
|
| 361 |
+
logger.debug("Retraction check failed: %s", e)
|
| 362 |
+
|
| 363 |
+
# URL liveness (F2)
|
| 364 |
+
if config.submission_extra.url_liveness:
|
| 365 |
+
try:
|
| 366 |
+
url_findings = URLChecker().check_entries(entries)
|
| 367 |
+
report_gen.set_url_findings(url_findings)
|
| 368 |
+
broken = sum(1 for f in url_findings if f.status != "ok")
|
| 369 |
+
if broken:
|
| 370 |
+
logger.info("URL liveness check: %d broken URL(s)", broken)
|
| 371 |
+
except Exception as e:
|
| 372 |
+
logger.debug("URL liveness check failed: %s", e)
|
| 373 |
|
| 374 |
# Process entries
|
| 375 |
|
|
|
|
| 441 |
# Determine number of workers (max 10 to avoid overwhelming APIs)
|
| 442 |
max_workers = min(10, len(entries))
|
| 443 |
|
| 444 |
+
interrupted = False
|
| 445 |
with progress.progress_context(len(entries), "Processing bibliography") as prog:
|
| 446 |
# Use ThreadPoolExecutor for parallel processing
|
| 447 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 448 |
# Submit all tasks
|
| 449 |
future_to_entry = {executor.submit(process_single_entry, entry): entry for entry in entries}
|
| 450 |
+
|
| 451 |
# Process completed tasks
|
| 452 |
+
try:
|
| 453 |
+
for future in as_completed(future_to_entry):
|
| 454 |
+
entry = future_to_entry[future]
|
| 455 |
+
try:
|
| 456 |
+
entry_report, comparison_result = future.result()
|
| 457 |
+
|
| 458 |
+
# Thread-safe progress update
|
| 459 |
+
with progress_lock:
|
| 460 |
+
report_gen.add_entry_report(entry_report)
|
| 461 |
+
|
| 462 |
+
# Update progress
|
| 463 |
+
if comparison_result and comparison_result.is_match:
|
| 464 |
+
prog.mark_success()
|
| 465 |
+
elif comparison_result and comparison_result.has_issues:
|
| 466 |
+
prog.mark_warning()
|
| 467 |
+
else:
|
| 468 |
+
prog.mark_error()
|
| 469 |
+
|
| 470 |
+
completed_count[0] += 1
|
| 471 |
+
prog.update(entry.key, "Done", 1)
|
| 472 |
+
|
| 473 |
+
except Exception as e:
|
| 474 |
+
with progress_lock:
|
| 475 |
prog.mark_error()
|
| 476 |
+
progress.print_error(f"Error processing {entry.key}: {e}")
|
| 477 |
+
completed_count[0] += 1
|
| 478 |
+
prog.update(entry.key, "Failed", 1)
|
| 479 |
+
except KeyboardInterrupt:
|
| 480 |
+
interrupted = True
|
| 481 |
+
logger.warning("Interrupted by user; cancelling remaining work and saving partial reports")
|
| 482 |
+
for f in future_to_entry:
|
| 483 |
+
f.cancel()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
|
| 485 |
# Generate reports and organize outputs (silent)
|
| 486 |
|
|
|
|
| 494 |
shutil.copy2(bib_path, output_dir / bib_path.name)
|
| 495 |
for tex_path in config._tex_files:
|
| 496 |
shutil.copy2(tex_path, output_dir / tex_path.name)
|
| 497 |
+
requested_formats = {f.lower() for f in (config.output.formats or ["markdown", "html"])}
|
| 498 |
+
|
| 499 |
+
# 1. Bibliography Report (markdown)
|
| 500 |
+
if "markdown" in requested_formats:
|
| 501 |
+
bib_report_path = output_dir / "bibliography_report.md"
|
| 502 |
+
report_gen.save_bibliography_report(str(bib_report_path))
|
| 503 |
+
|
| 504 |
+
# 2. LaTeX Quality Report (markdown)
|
| 505 |
+
if submission_results:
|
| 506 |
+
latex_report_path = output_dir / "latex_quality_report.md"
|
| 507 |
+
report_gen.save_latex_quality_report(
|
| 508 |
+
str(latex_report_path),
|
| 509 |
+
submission_results,
|
| 510 |
+
template,
|
| 511 |
+
)
|
| 512 |
+
|
| 513 |
+
# 4. Self-contained HTML (★)
|
| 514 |
+
if "html" in requested_formats:
|
| 515 |
+
try:
|
| 516 |
+
report_gen.save_html(str(output_dir / "report.html"))
|
| 517 |
+
except Exception as e:
|
| 518 |
+
logger.warning("Failed to write HTML report: %s", e)
|
| 519 |
+
|
| 520 |
+
# 5. JSON output
|
| 521 |
+
if "json" in requested_formats:
|
| 522 |
+
try:
|
| 523 |
+
report_gen.save_json(str(output_dir / "report.json"))
|
| 524 |
+
except Exception as e:
|
| 525 |
+
logger.warning("Failed to write JSON report: %s", e)
|
| 526 |
+
|
| 527 |
+
# 6. Clean bib file (if generated earlier)
|
|
|
|
|
|
|
|
|
|
| 528 |
if bib_config.check_usage and usage_checker:
|
| 529 |
used_entries = [er.entry for er in report_gen.entries if er.usage and er.usage.is_used]
|
| 530 |
if used_entries:
|
| 531 |
try:
|
| 532 |
keys_to_keep = {entry.key for entry in used_entries}
|
|
|
|
|
|
|
|
|
|
| 533 |
if len(config._bib_files) == 1:
|
| 534 |
clean_bib_path = output_dir / f"{config._bib_files[0].stem}_only_used.bib"
|
| 535 |
bib_parser.filter_file(str(config._bib_files[0]), str(clean_bib_path), keys_to_keep)
|
| 536 |
else:
|
| 537 |
clean_bib_path = output_dir / "merged_only_used.bib"
|
|
|
|
|
|
|
|
|
|
| 538 |
with open(clean_bib_path, 'w', encoding='utf-8') as f:
|
| 539 |
for entry in used_entries:
|
| 540 |
+
f.write(getattr(entry, "raw", "") + "\n\n")
|
| 541 |
except Exception as e:
|
| 542 |
+
logger.debug("Failed to write cleaned bib file: %s", e)
|
| 543 |
+
|
| 544 |
+
if interrupted:
|
| 545 |
+
print("[BibGuard] Saved partial reports for completed entries.")
|
| 546 |
|
| 547 |
# Print beautiful console summary
|
| 548 |
if not config.output.quiet:
|
|
|
|
| 554 |
entry, workflow_config, arxiv_fetcher, crossref_fetcher, scholar_fetcher,
|
| 555 |
semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator
|
| 556 |
):
|
| 557 |
+
"""
|
| 558 |
+
Fetch metadata across all configured sources and pick the best match.
|
| 559 |
+
|
| 560 |
+
Delegates the heavy lifting to ``app_helper.fetch_and_compare_with_workflow``,
|
| 561 |
+
which runs identifier-based and title-based lookups in parallel and uses
|
| 562 |
+
cross-source corroboration to decide is_match. Google Scholar is consulted
|
| 563 |
+
only as a last-resort fallback because scraping is fragile and frequently
|
| 564 |
+
blocked.
|
| 565 |
+
"""
|
| 566 |
+
from app_helper import fetch_and_compare_with_workflow as _parallel_lookup
|
| 567 |
+
|
| 568 |
+
primary = _parallel_lookup(
|
| 569 |
+
entry, workflow_config, arxiv_fetcher, crossref_fetcher,
|
| 570 |
+
semantic_scholar_fetcher, openalex_fetcher, dblp_fetcher, comparator,
|
| 571 |
+
)
|
| 572 |
+
|
| 573 |
+
if primary and primary.source != "unable":
|
| 574 |
+
return primary
|
| 575 |
+
|
| 576 |
+
# Last-resort Google Scholar fallback (web scraping; frequently blocked).
|
| 577 |
+
if entry.title and scholar_fetcher:
|
| 578 |
+
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
scholar_result = scholar_fetcher.search_by_title(entry.title)
|
| 580 |
if scholar_result:
|
| 581 |
+
return comparator.compare_with_scholar(entry, scholar_result)
|
| 582 |
+
except Exception as e:
|
| 583 |
+
logger.warning(
|
| 584 |
+
"Google Scholar fallback failed for entry=%s: %s",
|
| 585 |
+
getattr(entry, "key", "<unknown>"), e, exc_info=True,
|
| 586 |
+
)
|
| 587 |
+
|
| 588 |
+
return primary or comparator.create_unable_result(
|
| 589 |
+
entry, "Unable to find this paper in any data source"
|
| 590 |
+
)
|
|
|
|
|
|
|
| 591 |
|
| 592 |
|
| 593 |
def get_abstract(entry, comparison_result, arxiv_fetcher):
|
requirements.txt
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
gradio>=6.0.0
|
| 2 |
bibtexparser>=1.4.0
|
| 3 |
requests>=2.31.0
|
|
|
|
| 4 |
beautifulsoup4>=4.12.0
|
| 5 |
rich>=13.7.0
|
| 6 |
Unidecode>=1.3.0
|
|
|
|
| 1 |
gradio>=6.0.0
|
| 2 |
bibtexparser>=1.4.0
|
| 3 |
requests>=2.31.0
|
| 4 |
+
requests-cache>=1.2.0
|
| 5 |
beautifulsoup4>=4.12.0
|
| 6 |
rich>=13.7.0
|
| 7 |
Unidecode>=1.3.0
|
scripts/install-hook.sh
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Install a git pre-commit hook that runs BibGuard in --quick mode whenever
# the staged changes touch .bib or .tex files.
#
# Usage (run from the repo root that contains your paper, NOT BibGuard's repo):
#   bash /path/to/BibGuard/scripts/install-hook.sh
#
# Skip the hook for one commit: git commit --no-verify
set -euo pipefail

if ! git rev-parse --git-dir >/dev/null 2>&1; then
  echo "Error: not inside a git repo." >&2
  exit 1
fi

HOOK_DIR="$(git rev-parse --git-dir)/hooks"
HOOK="$HOOK_DIR/pre-commit"

# Locate BibGuard's main.py — we assume this script lives in BibGuard/scripts/.
BIBGUARD_DIR="$(cd "$(dirname "$0")/.." && pwd)"
MAIN_PY="$BIBGUARD_DIR/main.py"
if [[ ! -f "$MAIN_PY" ]]; then
  echo "Error: cannot locate BibGuard main.py at $MAIN_PY" >&2
  exit 1
fi

mkdir -p "$HOOK_DIR"

if [[ -f "$HOOK" ]]; then
  echo "A pre-commit hook already exists at $HOOK"
  echo "Backing it up to $HOOK.bibguard-backup"
  mv "$HOOK" "$HOOK.bibguard-backup"
fi

# NOTE: the heredoc is intentionally UNQUOTED so $MAIN_PY is baked into the
# generated hook at install time; variables that must expand at hook runtime
# are escaped with \$ below.
cat >"$HOOK" <<EOF
#!/usr/bin/env bash
# BibGuard pre-commit hook (auto-generated)
# Runs only if staged files include .tex or .bib.
set -e

if git diff --cached --name-only --diff-filter=ACM | grep -qE '\.(tex|bib)$'; then
  echo "[BibGuard] Running quick checks on staged paper sources…"
  # Fix: a bare "python" is absent on many modern systems (PEP 394 recommends
  # installing only "python3"). Prefer python3, fall back to python.
  if command -v python3 >/dev/null 2>&1; then
    BIBGUARD_PY=python3
  else
    BIBGUARD_PY=python
  fi
  "\$BIBGUARD_PY" "$MAIN_PY" --quick || {
    echo
    echo "[BibGuard] Issues found. Fix or run: git commit --no-verify to skip."
    exit 1
  }
fi
EOF

chmod +x "$HOOK"
echo "Installed BibGuard pre-commit hook at: $HOOK"
echo "It will run only when staged files include .tex or .bib."
src/__pycache__/__init__.cpython-311.pyc
DELETED
|
Binary file (202 Bytes)
|
|
|
src/__pycache__/__init__.cpython-313.pyc
DELETED
|
Binary file (190 Bytes)
|
|
|
src/analyzers/__pycache__/__init__.cpython-313.pyc
DELETED
|
Binary file (464 Bytes)
|
|
|
src/analyzers/__pycache__/duplicate_detector.cpython-313.pyc
DELETED
|
Binary file (8.29 kB)
|
|
|
src/analyzers/__pycache__/field_completeness_checker.cpython-313.pyc
DELETED
|
Binary file (5.4 kB)
|
|
|
src/analyzers/__pycache__/llm_evaluator.cpython-313.pyc
DELETED
|
Binary file (14.3 kB)
|
|
|
src/analyzers/__pycache__/metadata_comparator.cpython-313.pyc
DELETED
|
Binary file (18.9 kB)
|
|
|
src/analyzers/__pycache__/retraction_checker.cpython-313.pyc
DELETED
|
Binary file (4.94 kB)
|
|
|
src/analyzers/__pycache__/url_validator.cpython-313.pyc
DELETED
|
Binary file (8.3 kB)
|
|
|
src/analyzers/__pycache__/usage_checker.cpython-313.pyc
DELETED
|
Binary file (4.4 kB)
|
|
|
src/analyzers/__pycache__/venue_normalizer.cpython-313.pyc
DELETED
|
Binary file (13.3 kB)
|
|
|
src/analyzers/llm_evaluator.py
CHANGED
|
@@ -3,14 +3,18 @@ LLM-based citation relevance evaluator.
|
|
| 3 |
Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends.
|
| 4 |
"""
|
| 5 |
import json
|
|
|
|
| 6 |
import re
|
| 7 |
-
|
| 8 |
-
from
|
|
|
|
| 9 |
from enum import Enum
|
| 10 |
import os
|
| 11 |
|
| 12 |
import requests
|
| 13 |
|
|
|
|
|
|
|
| 14 |
|
| 15 |
class LLMBackend(Enum):
|
| 16 |
OPENAI = "openai"
|
|
@@ -21,6 +25,52 @@ class LLMBackend(Enum):
|
|
| 21 |
DEEPSEEK = "deepseek"
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
@dataclass
|
| 25 |
class EvaluationResult:
|
| 26 |
"""Result of LLM citation evaluation."""
|
|
@@ -30,15 +80,16 @@ class EvaluationResult:
|
|
| 30 |
explanation: str
|
| 31 |
context_used: str
|
| 32 |
abstract_used: str
|
|
|
|
| 33 |
line_number: Optional[int] = None
|
| 34 |
file_path: Optional[str] = None
|
| 35 |
error: Optional[str] = None
|
| 36 |
-
|
| 37 |
@property
|
| 38 |
def score_label(self) -> str:
|
| 39 |
labels = {
|
| 40 |
1: "Not Relevant",
|
| 41 |
-
2: "Marginally Relevant",
|
| 42 |
3: "Somewhat Relevant",
|
| 43 |
4: "Relevant",
|
| 44 |
5: "Highly Relevant"
|
|
@@ -49,7 +100,7 @@ class EvaluationResult:
|
|
| 49 |
class LLMEvaluator:
|
| 50 |
"""Evaluates citation relevance using LLM."""
|
| 51 |
|
| 52 |
-
PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant.
|
| 53 |
|
| 54 |
## Citation Context (from the manuscript):
|
| 55 |
{context}
|
|
@@ -62,23 +113,28 @@ Evaluate the relevance and appropriateness of this citation. Consider:
|
|
| 62 |
1. Does the citation support the claim being made in the context?
|
| 63 |
2. Is the cited paper's topic related to the discussion?
|
| 64 |
3. Is this citation necessary, or could it be replaced with a more relevant one?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
## Response Format:
|
| 67 |
-
|
| 68 |
{{
|
| 69 |
-
"relevance_score": <1-5
|
| 70 |
-
"is_relevant": <true
|
| 71 |
-
"
|
|
|
|
| 72 |
}}
|
| 73 |
|
| 74 |
-
Score guide:
|
| 75 |
-
|
| 76 |
-
- 2: Marginally relevant
|
| 77 |
-
- 3: Somewhat relevant
|
| 78 |
-
- 4: Relevant and appropriate
|
| 79 |
-
- 5: Highly relevant and essential
|
| 80 |
-
|
| 81 |
-
STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other text."""
|
| 82 |
|
| 83 |
def __init__(
|
| 84 |
self,
|
|
@@ -90,28 +146,32 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
|
|
| 90 |
self.backend = backend
|
| 91 |
self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")
|
| 92 |
|
| 93 |
-
# Set defaults based on backend
|
| 94 |
if backend == LLMBackend.OPENAI:
|
| 95 |
self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions"
|
| 96 |
-
self.model = model or "gpt-
|
| 97 |
elif backend == LLMBackend.ANTHROPIC:
|
| 98 |
self.endpoint = endpoint or "https://api.anthropic.com/v1/messages"
|
| 99 |
-
self.model = model or "claude-4
|
| 100 |
elif backend == LLMBackend.DEEPSEEK:
|
| 101 |
self.endpoint = endpoint or "https://api.deepseek.com/chat/completions"
|
| 102 |
self.model = model or "deepseek-chat"
|
| 103 |
elif backend == LLMBackend.OLLAMA:
|
| 104 |
self.endpoint = endpoint or "http://localhost:11434/api/generate"
|
| 105 |
-
self.model = model or "
|
| 106 |
elif backend == LLMBackend.VLLM:
|
| 107 |
self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions"
|
| 108 |
-
self.model = model or "Qwen/
|
| 109 |
elif backend == LLMBackend.GEMINI:
|
| 110 |
self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models"
|
| 111 |
-
self.model = model or "gemini-2.5-flash
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
|
| 114 |
-
"""Evaluate citation relevance."""
|
| 115 |
if not context or not abstract:
|
| 116 |
return EvaluationResult(
|
| 117 |
entry_key=entry_key,
|
|
@@ -122,34 +182,51 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
|
|
| 122 |
abstract_used=abstract,
|
| 123 |
error="Missing context or abstract for evaluation"
|
| 124 |
)
|
| 125 |
-
|
| 126 |
-
# Don't truncate - preserve full context and abstract
|
| 127 |
prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
def _call_openai_compatible(self, prompt: str) -> str:
|
| 155 |
"""Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM)."""
|
|
@@ -272,24 +349,77 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
|
|
| 272 |
return parts[0].get("text", "")
|
| 273 |
return ""
|
| 274 |
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
try:
|
| 289 |
-
|
|
|
|
|
|
|
| 290 |
except json.JSONDecodeError:
|
| 291 |
pass
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
if not data:
|
| 294 |
return EvaluationResult(
|
| 295 |
entry_key=entry_key,
|
|
@@ -301,27 +431,44 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
|
|
| 301 |
error="Failed to parse LLM response as JSON"
|
| 302 |
)
|
| 303 |
|
| 304 |
-
# Extract fields
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
is_relevant = data.get("is_relevant",
|
| 313 |
if isinstance(is_relevant, str):
|
| 314 |
-
is_relevant = is_relevant.lower() in ("true", "yes", "1")
|
| 315 |
-
|
| 316 |
-
explanation = data.get("explanation", "")
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
return EvaluationResult(
|
| 319 |
entry_key=entry_key,
|
| 320 |
relevance_score=relevance_score,
|
| 321 |
is_relevant=is_relevant,
|
| 322 |
explanation=explanation,
|
| 323 |
context_used=context,
|
| 324 |
-
abstract_used=abstract
|
|
|
|
| 325 |
)
|
| 326 |
|
| 327 |
def test_connection(self) -> bool:
|
|
@@ -371,6 +518,7 @@ STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other tex
|
|
| 371 |
}
|
| 372 |
response = requests.post(url, json=payload, timeout=10)
|
| 373 |
return response.status_code == 200
|
| 374 |
-
except Exception:
|
|
|
|
| 375 |
return False
|
| 376 |
return False
|
|
|
|
| 3 |
Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends.
|
| 4 |
"""
|
| 5 |
import json
|
| 6 |
+
import logging
|
| 7 |
import re
|
| 8 |
+
import time
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import Optional, Dict, Any, Tuple
|
| 11 |
from enum import Enum
|
| 12 |
import os
|
| 13 |
|
| 14 |
import requests
|
| 15 |
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
|
| 19 |
class LLMBackend(Enum):
|
| 20 |
OPENAI = "openai"
|
|
|
|
| 25 |
DEEPSEEK = "deepseek"
|
| 26 |
|
| 27 |
|
| 28 |
+
# Map backend → environment variable name for the API key.
|
| 29 |
+
_BACKEND_ENV = {
|
| 30 |
+
LLMBackend.OPENAI: "OPENAI_API_KEY",
|
| 31 |
+
LLMBackend.ANTHROPIC: "ANTHROPIC_API_KEY",
|
| 32 |
+
LLMBackend.GEMINI: "GEMINI_API_KEY",
|
| 33 |
+
LLMBackend.DEEPSEEK: "DEEPSEEK_API_KEY",
|
| 34 |
+
LLMBackend.VLLM: "VLLM_API_KEY",
|
| 35 |
+
LLMBackend.OLLAMA: "", # local, no key
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
# Order in which we auto-detect a usable backend when the user hasn't picked
|
| 39 |
+
# one explicitly. Cheapest/fastest first.
|
| 40 |
+
_AUTODETECT_ORDER = [
|
| 41 |
+
LLMBackend.GEMINI,
|
| 42 |
+
LLMBackend.OPENAI,
|
| 43 |
+
LLMBackend.DEEPSEEK,
|
| 44 |
+
LLMBackend.ANTHROPIC,
|
| 45 |
+
LLMBackend.OLLAMA,
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def autodetect_backend() -> Optional[Tuple[LLMBackend, str]]:
    """
    Find the first backend that has credentials in the environment.

    Returns (backend, api_key) or None. For Ollama we attempt a localhost
    probe so users with `ollama serve` running get auto-selected with no
    config.
    """
    for candidate in _AUTODETECT_ORDER:
        env_name = _BACKEND_ENV.get(candidate, "")
        if not env_name:
            # No env var maps to this backend; only Ollama is probeable locally.
            if candidate != LLMBackend.OLLAMA:
                continue
            # Local probe — small timeout so absence isn't painful.
            try:
                probe = requests.get("http://localhost:11434/api/tags", timeout=1.0)
            except requests.RequestException:
                continue
            if probe.status_code == 200:
                return candidate, ""
            continue
        api_key = os.environ.get(env_name, "").strip()
        if api_key:
            return candidate, api_key
    return None
+
|
| 73 |
+
|
| 74 |
@dataclass
|
| 75 |
class EvaluationResult:
|
| 76 |
"""Result of LLM citation evaluation."""
|
|
|
|
| 80 |
explanation: str
|
| 81 |
context_used: str
|
| 82 |
abstract_used: str
|
| 83 |
+
citation_role: str = "" # baseline | method | dataset | counterexample | survey | motivation | other
|
| 84 |
line_number: Optional[int] = None
|
| 85 |
file_path: Optional[str] = None
|
| 86 |
error: Optional[str] = None
|
| 87 |
+
|
| 88 |
@property
|
| 89 |
def score_label(self) -> str:
|
| 90 |
labels = {
|
| 91 |
1: "Not Relevant",
|
| 92 |
+
2: "Marginally Relevant",
|
| 93 |
3: "Somewhat Relevant",
|
| 94 |
4: "Relevant",
|
| 95 |
5: "Highly Relevant"
|
|
|
|
| 100 |
class LLMEvaluator:
|
| 101 |
"""Evaluates citation relevance using LLM."""
|
| 102 |
|
| 103 |
+
PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant, and identify the citation's role in the manuscript.
|
| 104 |
|
| 105 |
## Citation Context (from the manuscript):
|
| 106 |
{context}
|
|
|
|
| 113 |
1. Does the citation support the claim being made in the context?
|
| 114 |
2. Is the cited paper's topic related to the discussion?
|
| 115 |
3. Is this citation necessary, or could it be replaced with a more relevant one?
|
| 116 |
+
4. What is the *role* of this citation in the manuscript?
|
| 117 |
+
|
| 118 |
+
## Citation roles (pick exactly one):
|
| 119 |
+
- "baseline": cited paper is used/compared as a baseline or prior method.
|
| 120 |
+
- "method": cited paper introduces a method that the manuscript builds on or uses directly.
|
| 121 |
+
- "dataset": cited paper provides a dataset/benchmark the manuscript uses.
|
| 122 |
+
- "counterexample": cited to show a contrary finding or argue against.
|
| 123 |
+
- "survey": cited as a survey/overview reference.
|
| 124 |
+
- "motivation": cited to motivate the problem (background, application, statistics).
|
| 125 |
+
- "other": none of the above clearly applies.
|
| 126 |
|
| 127 |
## Response Format:
|
| 128 |
+
Respond with ONE JSON object, no other text:
|
| 129 |
{{
|
| 130 |
+
"relevance_score": <integer 1-5>,
|
| 131 |
+
"is_relevant": <true|false>,
|
| 132 |
+
"citation_role": "<one of: baseline|method|dataset|counterexample|survey|motivation|other>",
|
| 133 |
+
"explanation": "<1-2 sentences>"
|
| 134 |
}}
|
| 135 |
|
| 136 |
+
Score guide: 1=Not relevant, 2=Marginally, 3=Somewhat, 4=Relevant, 5=Highly relevant.
|
| 137 |
+
STRICTLY FOLLOW THE JSON FORMAT."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
def __init__(
|
| 140 |
self,
|
|
|
|
| 146 |
self.backend = backend
|
| 147 |
self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")
|
| 148 |
|
| 149 |
+
# Set defaults based on backend (cheap, fast models that exist)
|
| 150 |
if backend == LLMBackend.OPENAI:
|
| 151 |
self.endpoint = endpoint or "https://api.openai.com/v1/chat/completions"
|
| 152 |
+
self.model = model or "gpt-4o-mini"
|
| 153 |
elif backend == LLMBackend.ANTHROPIC:
|
| 154 |
self.endpoint = endpoint or "https://api.anthropic.com/v1/messages"
|
| 155 |
+
self.model = model or "claude-haiku-4-5-20251001"
|
| 156 |
elif backend == LLMBackend.DEEPSEEK:
|
| 157 |
self.endpoint = endpoint or "https://api.deepseek.com/chat/completions"
|
| 158 |
self.model = model or "deepseek-chat"
|
| 159 |
elif backend == LLMBackend.OLLAMA:
|
| 160 |
self.endpoint = endpoint or "http://localhost:11434/api/generate"
|
| 161 |
+
self.model = model or "qwen2.5:3b-instruct"
|
| 162 |
elif backend == LLMBackend.VLLM:
|
| 163 |
self.endpoint = endpoint or "http://localhost:8000/v1/chat/completions"
|
| 164 |
+
self.model = model or "Qwen/Qwen2.5-3B-Instruct"
|
| 165 |
elif backend == LLMBackend.GEMINI:
|
| 166 |
self.endpoint = endpoint or "https://generativelanguage.googleapis.com/v1beta/models"
|
| 167 |
+
self.model = model or "gemini-2.5-flash"
|
| 168 |
|
| 169 |
+
# Retry config for transient LLM failures (rate limits, server errors, JSON issues).
|
| 170 |
+
MAX_ATTEMPTS = 3
|
| 171 |
+
RETRY_BASE_DELAY = 1.5 # seconds, exponential
|
| 172 |
+
|
| 173 |
def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
|
| 174 |
+
"""Evaluate citation relevance with retries on transient errors."""
|
| 175 |
if not context or not abstract:
|
| 176 |
return EvaluationResult(
|
| 177 |
entry_key=entry_key,
|
|
|
|
| 182 |
abstract_used=abstract,
|
| 183 |
error="Missing context or abstract for evaluation"
|
| 184 |
)
|
| 185 |
+
|
|
|
|
| 186 |
prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)
|
| 187 |
+
|
| 188 |
+
last_err: Optional[str] = None
|
| 189 |
+
for attempt in range(1, self.MAX_ATTEMPTS + 1):
|
| 190 |
+
try:
|
| 191 |
+
if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
|
| 192 |
+
response = self._call_openai_compatible(prompt)
|
| 193 |
+
elif self.backend == LLMBackend.ANTHROPIC:
|
| 194 |
+
response = self._call_anthropic(prompt)
|
| 195 |
+
elif self.backend == LLMBackend.OLLAMA:
|
| 196 |
+
response = self._call_ollama(prompt)
|
| 197 |
+
elif self.backend == LLMBackend.GEMINI:
|
| 198 |
+
response = self._call_gemini(prompt)
|
| 199 |
+
else:
|
| 200 |
+
raise ValueError(f"Unknown backend: {self.backend}")
|
| 201 |
+
|
| 202 |
+
parsed = self._parse_response(entry_key, response, context, abstract)
|
| 203 |
+
# Successful structured parse → return.
|
| 204 |
+
if parsed.error is None:
|
| 205 |
+
return parsed
|
| 206 |
+
# JSON parse failed — retry with the same prompt; LLM jitter
|
| 207 |
+
# often resolves on a second pass.
|
| 208 |
+
last_err = parsed.error
|
| 209 |
+
except requests.exceptions.RequestException as e:
|
| 210 |
+
last_err = f"network: {e}"
|
| 211 |
+
# Transient: retry with backoff.
|
| 212 |
+
except Exception as e:
|
| 213 |
+
last_err = str(e)
|
| 214 |
+
|
| 215 |
+
if attempt < self.MAX_ATTEMPTS:
|
| 216 |
+
delay = self.RETRY_BASE_DELAY * (2 ** (attempt - 1))
|
| 217 |
+
logger.debug("LLM attempt %d/%d failed (%s); retrying in %.1fs",
|
| 218 |
+
attempt, self.MAX_ATTEMPTS, last_err, delay)
|
| 219 |
+
time.sleep(delay)
|
| 220 |
+
|
| 221 |
+
return EvaluationResult(
|
| 222 |
+
entry_key=entry_key,
|
| 223 |
+
relevance_score=0,
|
| 224 |
+
is_relevant=False,
|
| 225 |
+
explanation="",
|
| 226 |
+
context_used=context,
|
| 227 |
+
abstract_used=abstract,
|
| 228 |
+
error=last_err or "Unknown error after retries"
|
| 229 |
+
)
|
| 230 |
|
| 231 |
def _call_openai_compatible(self, prompt: str) -> str:
|
| 232 |
"""Call OpenAI-compatible API (OpenAI, DeepSeek, vLLM)."""
|
|
|
|
| 349 |
return parts[0].get("text", "")
|
| 350 |
return ""
|
| 351 |
|
| 352 |
+
@staticmethod
|
| 353 |
+
def _extract_json_object(text: str) -> Optional[dict]:
|
| 354 |
+
"""
|
| 355 |
+
Robust JSON extraction. Handles:
|
| 356 |
+
- bare JSON
|
| 357 |
+
- fenced ```json ... ``` blocks
|
| 358 |
+
- JSON embedded in surrounding prose
|
| 359 |
+
- nested objects (the simple `\\{[^{}]*\\}` regex misses these)
|
| 360 |
+
"""
|
| 361 |
+
if not text:
|
| 362 |
+
return None
|
| 363 |
+
s = text.strip()
|
| 364 |
+
|
| 365 |
+
# Direct parse
|
| 366 |
+
try:
|
| 367 |
+
obj = json.loads(s)
|
| 368 |
+
if isinstance(obj, dict):
|
| 369 |
+
return obj
|
| 370 |
+
except json.JSONDecodeError:
|
| 371 |
+
pass
|
| 372 |
+
|
| 373 |
+
# Strip Markdown code fences (```json ... ``` or ``` ... ```)
|
| 374 |
+
fence_match = re.search(r"```(?:json)?\s*(.*?)```", s, re.DOTALL | re.IGNORECASE)
|
| 375 |
+
if fence_match:
|
| 376 |
+
inner = fence_match.group(1).strip()
|
| 377 |
try:
|
| 378 |
+
obj = json.loads(inner)
|
| 379 |
+
if isinstance(obj, dict):
|
| 380 |
+
return obj
|
| 381 |
except json.JSONDecodeError:
|
| 382 |
pass
|
| 383 |
+
s = inner # fall through to brace-balance scan on inner
|
| 384 |
+
|
| 385 |
+
# Brace-balanced scan: find the first complete top-level {...}.
|
| 386 |
+
start = s.find("{")
|
| 387 |
+
while start != -1:
|
| 388 |
+
depth = 0
|
| 389 |
+
in_str = False
|
| 390 |
+
esc = False
|
| 391 |
+
for i in range(start, len(s)):
|
| 392 |
+
ch = s[i]
|
| 393 |
+
if esc:
|
| 394 |
+
esc = False
|
| 395 |
+
continue
|
| 396 |
+
if ch == "\\":
|
| 397 |
+
esc = True
|
| 398 |
+
continue
|
| 399 |
+
if ch == '"':
|
| 400 |
+
in_str = not in_str
|
| 401 |
+
continue
|
| 402 |
+
if in_str:
|
| 403 |
+
continue
|
| 404 |
+
if ch == "{":
|
| 405 |
+
depth += 1
|
| 406 |
+
elif ch == "}":
|
| 407 |
+
depth -= 1
|
| 408 |
+
if depth == 0:
|
| 409 |
+
chunk = s[start:i + 1]
|
| 410 |
+
try:
|
| 411 |
+
obj = json.loads(chunk)
|
| 412 |
+
if isinstance(obj, dict):
|
| 413 |
+
return obj
|
| 414 |
+
except json.JSONDecodeError:
|
| 415 |
+
break
|
| 416 |
+
start = s.find("{", start + 1)
|
| 417 |
+
return None
|
| 418 |
+
|
| 419 |
+
def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult:
|
| 420 |
+
"""Parse LLM response with robust JSON extraction."""
|
| 421 |
+
data = self._extract_json_object(response) or {}
|
| 422 |
+
|
| 423 |
if not data:
|
| 424 |
return EvaluationResult(
|
| 425 |
entry_key=entry_key,
|
|
|
|
| 431 |
error="Failed to parse LLM response as JSON"
|
| 432 |
)
|
| 433 |
|
| 434 |
+
# Extract & validate fields
|
| 435 |
+
raw_score = data.get("relevance_score", data.get("score", 0))
|
| 436 |
+
try:
|
| 437 |
+
relevance_score = int(float(raw_score))
|
| 438 |
+
except (TypeError, ValueError):
|
| 439 |
+
relevance_score = 0
|
| 440 |
+
relevance_score = max(0, min(5, relevance_score))
|
| 441 |
+
|
| 442 |
+
is_relevant = data.get("is_relevant", relevance_score >= 4)
|
| 443 |
if isinstance(is_relevant, str):
|
| 444 |
+
is_relevant = is_relevant.strip().lower() in ("true", "yes", "1", "y")
|
| 445 |
+
|
| 446 |
+
explanation = str(data.get("explanation", data.get("reason", ""))).strip()
|
| 447 |
+
citation_role = str(data.get("citation_role", data.get("role", ""))).strip().lower() or "other"
|
| 448 |
+
if citation_role not in {"baseline", "method", "dataset", "counterexample", "survey", "motivation", "other"}:
|
| 449 |
+
citation_role = "other"
|
| 450 |
+
|
| 451 |
+
# Sanity: a score of 0 means the LLM didn't actually return one — flag it.
|
| 452 |
+
if relevance_score == 0:
|
| 453 |
+
return EvaluationResult(
|
| 454 |
+
entry_key=entry_key,
|
| 455 |
+
relevance_score=0,
|
| 456 |
+
is_relevant=False,
|
| 457 |
+
explanation=explanation or response,
|
| 458 |
+
context_used=context,
|
| 459 |
+
abstract_used=abstract,
|
| 460 |
+
citation_role=citation_role,
|
| 461 |
+
error="LLM did not return a usable relevance_score",
|
| 462 |
+
)
|
| 463 |
+
|
| 464 |
return EvaluationResult(
|
| 465 |
entry_key=entry_key,
|
| 466 |
relevance_score=relevance_score,
|
| 467 |
is_relevant=is_relevant,
|
| 468 |
explanation=explanation,
|
| 469 |
context_used=context,
|
| 470 |
+
abstract_used=abstract,
|
| 471 |
+
citation_role=citation_role,
|
| 472 |
)
|
| 473 |
|
| 474 |
def test_connection(self) -> bool:
|
|
|
|
| 518 |
}
|
| 519 |
response = requests.post(url, json=payload, timeout=10)
|
| 520 |
return response.status_code == 200
|
| 521 |
+
except Exception as e:
|
| 522 |
+
logger.debug("LLM test_connection failed for %s: %s", self.backend.value, e)
|
| 523 |
return False
|
| 524 |
return False
|
src/analyzers/metadata_comparator.py
CHANGED
|
@@ -18,30 +18,41 @@ from ..utils.normalizer import TextNormalizer
|
|
| 18 |
class ComparisonResult:
|
| 19 |
"""Result of comparing bib entry with fetched metadata."""
|
| 20 |
entry_key: str
|
| 21 |
-
|
| 22 |
# Title comparison
|
| 23 |
title_match: bool
|
| 24 |
title_similarity: float
|
| 25 |
bib_title: str
|
| 26 |
fetched_title: str
|
| 27 |
-
|
| 28 |
# Author comparison
|
| 29 |
author_match: bool
|
| 30 |
author_similarity: float
|
| 31 |
bib_authors: list[str]
|
| 32 |
fetched_authors: list[str]
|
| 33 |
-
|
| 34 |
# Year comparison
|
| 35 |
year_match: bool
|
| 36 |
bib_year: str
|
| 37 |
fetched_year: str
|
| 38 |
-
|
| 39 |
# Overall assessment
|
| 40 |
is_match: bool
|
| 41 |
confidence: float
|
| 42 |
issues: list[str]
|
| 43 |
source: str # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable'
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
@property
|
| 46 |
def has_issues(self) -> bool:
|
| 47 |
return len(self.issues) > 0
|
|
@@ -60,7 +71,17 @@ class MetadataComparator:
|
|
| 60 |
def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult:
|
| 61 |
"""Compare bib entry with arXiv metadata."""
|
| 62 |
issues = []
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# Compare titles
|
| 65 |
bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
|
| 66 |
arxiv_title_norm = self.normalizer.normalize_for_comparison(arxiv_meta.title)
|
|
@@ -114,7 +135,8 @@ class MetadataComparator:
|
|
| 114 |
is_match=is_match,
|
| 115 |
confidence=confidence,
|
| 116 |
issues=issues,
|
| 117 |
-
source="arxiv"
|
|
|
|
| 118 |
)
|
| 119 |
|
| 120 |
def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult:
|
|
|
|
| 18 |
class ComparisonResult:
|
| 19 |
"""Result of comparing bib entry with fetched metadata."""
|
| 20 |
entry_key: str
|
| 21 |
+
|
| 22 |
# Title comparison
|
| 23 |
title_match: bool
|
| 24 |
title_similarity: float
|
| 25 |
bib_title: str
|
| 26 |
fetched_title: str
|
| 27 |
+
|
| 28 |
# Author comparison
|
| 29 |
author_match: bool
|
| 30 |
author_similarity: float
|
| 31 |
bib_authors: list[str]
|
| 32 |
fetched_authors: list[str]
|
| 33 |
+
|
| 34 |
# Year comparison
|
| 35 |
year_match: bool
|
| 36 |
bib_year: str
|
| 37 |
fetched_year: str
|
| 38 |
+
|
| 39 |
# Overall assessment
|
| 40 |
is_match: bool
|
| 41 |
confidence: float
|
| 42 |
issues: list[str]
|
| 43 |
source: str # 'arxiv', 'crossref', 'scholar', 'semantic_scholar', 'openalex', 'dblp', or 'unable'
|
| 44 |
+
|
| 45 |
+
# F4: When an arXiv preprint has a published counterpart, surface it here.
|
| 46 |
+
published_version_hint: str = "" # e.g. "Also published at NeurIPS 2024 (doi:10.1145/...)"
|
| 47 |
+
|
| 48 |
+
# Positive / informational notes that should NOT be counted as issues
|
| 49 |
+
# (e.g. "corroborated by S2", "year differs by ≤1, treated as match").
|
| 50 |
+
notes: list[str] = None # type: ignore[assignment]
|
| 51 |
+
|
| 52 |
+
def __post_init__(self):
|
| 53 |
+
if self.notes is None:
|
| 54 |
+
self.notes = []
|
| 55 |
+
|
| 56 |
@property
|
| 57 |
def has_issues(self) -> bool:
|
| 58 |
return len(self.issues) > 0
|
|
|
|
| 71 |
def compare_with_arxiv(self, bib_entry: BibEntry, arxiv_meta: ArxivMetadata) -> ComparisonResult:
|
| 72 |
"""Compare bib entry with arXiv metadata."""
|
| 73 |
issues = []
|
| 74 |
+
|
| 75 |
+
# F4: Extract a published-version hint if arXiv records it.
|
| 76 |
+
published_hint = ""
|
| 77 |
+
if arxiv_meta.journal_ref or arxiv_meta.doi:
|
| 78 |
+
parts = []
|
| 79 |
+
if arxiv_meta.journal_ref:
|
| 80 |
+
parts.append(arxiv_meta.journal_ref.strip())
|
| 81 |
+
if arxiv_meta.doi:
|
| 82 |
+
parts.append(f"doi:{arxiv_meta.doi.strip()}")
|
| 83 |
+
published_hint = "Has a published version — " + " | ".join(parts)
|
| 84 |
+
|
| 85 |
# Compare titles
|
| 86 |
bib_title_norm = self.normalizer.normalize_for_comparison(bib_entry.title)
|
| 87 |
arxiv_title_norm = self.normalizer.normalize_for_comparison(arxiv_meta.title)
|
|
|
|
| 135 |
is_match=is_match,
|
| 136 |
confidence=confidence,
|
| 137 |
issues=issues,
|
| 138 |
+
source="arxiv",
|
| 139 |
+
published_version_hint=published_hint,
|
| 140 |
)
|
| 141 |
|
| 142 |
def compare_with_scholar(self, bib_entry: BibEntry, scholar_result: ScholarResult) -> ComparisonResult:
|
src/checkers/__init__.py
CHANGED
|
@@ -11,6 +11,7 @@ from .consistency_checker import ConsistencyChecker
|
|
| 11 |
from .citation_quality_checker import CitationQualityChecker
|
| 12 |
from .equation_checker import EquationChecker
|
| 13 |
from .acronym_checker import AcronymChecker
|
|
|
|
| 14 |
|
| 15 |
__all__ = [
|
| 16 |
'BaseChecker',
|
|
@@ -27,6 +28,7 @@ __all__ = [
|
|
| 27 |
'CitationQualityChecker',
|
| 28 |
'EquationChecker',
|
| 29 |
'AcronymChecker',
|
|
|
|
| 30 |
]
|
| 31 |
|
| 32 |
|
|
@@ -43,6 +45,7 @@ CHECKER_REGISTRY = {
|
|
| 43 |
'citation_quality': CitationQualityChecker,
|
| 44 |
'equation': EquationChecker,
|
| 45 |
'acronym': AcronymChecker,
|
|
|
|
| 46 |
}
|
| 47 |
|
| 48 |
|
|
|
|
| 11 |
from .citation_quality_checker import CitationQualityChecker
|
| 12 |
from .equation_checker import EquationChecker
|
| 13 |
from .acronym_checker import AcronymChecker
|
| 14 |
+
from .template_checker import TemplateChecker
|
| 15 |
|
| 16 |
__all__ = [
|
| 17 |
'BaseChecker',
|
|
|
|
| 28 |
'CitationQualityChecker',
|
| 29 |
'EquationChecker',
|
| 30 |
'AcronymChecker',
|
| 31 |
+
'TemplateChecker',
|
| 32 |
]
|
| 33 |
|
| 34 |
|
|
|
|
| 45 |
'citation_quality': CitationQualityChecker,
|
| 46 |
'equation': EquationChecker,
|
| 47 |
'acronym': AcronymChecker,
|
| 48 |
+
'template': TemplateChecker,
|
| 49 |
}
|
| 50 |
|
| 51 |
|
src/checkers/__pycache__/__init__.cpython-313.pyc
DELETED
|
Binary file (2.2 kB)
|
|
|
src/checkers/__pycache__/acronym_checker.cpython-313.pyc
DELETED
|
Binary file (10.8 kB)
|
|
|
src/checkers/__pycache__/ai_artifacts_checker.cpython-313.pyc
DELETED
|
Binary file (6.14 kB)
|
|
|
src/checkers/__pycache__/anonymization_checker.cpython-313.pyc
DELETED
|
Binary file (8.38 kB)
|
|
|
src/checkers/__pycache__/base.cpython-313.pyc
DELETED
|
Binary file (7.68 kB)
|
|
|
src/checkers/__pycache__/caption_checker.cpython-313.pyc
DELETED
|
Binary file (5.63 kB)
|
|
|
src/checkers/__pycache__/citation_quality_checker.cpython-313.pyc
DELETED
|
Binary file (5.41 kB)
|
|
|
src/checkers/__pycache__/consistency_checker.cpython-313.pyc
DELETED
|
Binary file (11 kB)
|
|
|
src/checkers/__pycache__/equation_checker.cpython-313.pyc
DELETED
|
Binary file (5.62 kB)
|
|
|
src/checkers/__pycache__/formatting_checker.cpython-313.pyc
DELETED
|
Binary file (9.45 kB)
|
|
|
src/checkers/__pycache__/number_checker.cpython-313.pyc
DELETED
|
Binary file (3.8 kB)
|
|
|
src/checkers/__pycache__/reference_checker.cpython-313.pyc
DELETED
|
Binary file (8.3 kB)
|
|
|
src/checkers/__pycache__/sentence_checker.cpython-313.pyc
DELETED
|
Binary file (4.36 kB)
|
|
|
src/checkers/acronym_checker.py
CHANGED
|
@@ -87,23 +87,30 @@ class AcronymChecker(BaseChecker):
|
|
| 87 |
}
|
| 88 |
|
| 89 |
def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
|
|
|
|
| 90 |
results = []
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
# Remove comments using base class method
|
| 93 |
content = self._remove_comments(tex_content)
|
| 94 |
-
|
| 95 |
# Find all defined acronyms with their positions
|
| 96 |
defined_acronyms = self._find_definitions(content)
|
| 97 |
-
|
| 98 |
# Find all acronym usages (excluding special contexts)
|
| 99 |
all_usages = self._find_all_usages(content)
|
| 100 |
-
|
| 101 |
# NEW: Find potential full forms for each acronym
|
| 102 |
acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())
|
| 103 |
-
|
| 104 |
# Check for undefined acronyms (only those with matching full forms)
|
| 105 |
for acronym, positions in all_usages.items():
|
| 106 |
-
if acronym in
|
| 107 |
continue
|
| 108 |
|
| 109 |
# Skip if no matching full form found in document
|
|
|
|
| 87 |
}
|
| 88 |
|
| 89 |
def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
|
| 90 |
+
config = config or {}
|
| 91 |
results = []
|
| 92 |
+
|
| 93 |
+
# Project glossary: skip-set + auto-defined map
|
| 94 |
+
user_acronyms = dict(config.get('glossary_acronyms', {}) or {})
|
| 95 |
+
# All user-supplied acronyms are considered "known/defined" — never warn about them.
|
| 96 |
+
glossary_skip = {k.upper() for k in user_acronyms.keys()}
|
| 97 |
+
common_plus_glossary = self.COMMON_ACRONYMS | glossary_skip
|
| 98 |
+
|
| 99 |
# Remove comments using base class method
|
| 100 |
content = self._remove_comments(tex_content)
|
| 101 |
+
|
| 102 |
# Find all defined acronyms with their positions
|
| 103 |
defined_acronyms = self._find_definitions(content)
|
| 104 |
+
|
| 105 |
# Find all acronym usages (excluding special contexts)
|
| 106 |
all_usages = self._find_all_usages(content)
|
| 107 |
+
|
| 108 |
# NEW: Find potential full forms for each acronym
|
| 109 |
acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())
|
| 110 |
+
|
| 111 |
# Check for undefined acronyms (only those with matching full forms)
|
| 112 |
for acronym, positions in all_usages.items():
|
| 113 |
+
if acronym in common_plus_glossary:
|
| 114 |
continue
|
| 115 |
|
| 116 |
# Skip if no matching full form found in document
|
src/checkers/ai_artifacts_checker.py
CHANGED
|
@@ -125,7 +125,7 @@ class AIArtifactsChecker(BaseChecker):
|
|
| 125 |
severity=CheckSeverity.ERROR,
|
| 126 |
message=f"{description} detected",
|
| 127 |
line_number=line_num,
|
| 128 |
-
line_content=line.strip()
|
| 129 |
suggestion="Remove AI-generated conversational text"
|
| 130 |
))
|
| 131 |
break # One match per line for this category
|
|
@@ -139,7 +139,7 @@ class AIArtifactsChecker(BaseChecker):
|
|
| 139 |
severity=CheckSeverity.WARNING,
|
| 140 |
message=f"{description}: '{match.group(0)[:50]}'",
|
| 141 |
line_number=line_num,
|
| 142 |
-
line_content=line.strip()
|
| 143 |
suggestion="Replace placeholder with actual content or remove"
|
| 144 |
))
|
| 145 |
|
|
@@ -169,7 +169,7 @@ class AIArtifactsChecker(BaseChecker):
|
|
| 169 |
severity=CheckSeverity.INFO,
|
| 170 |
message=f"Possible {description} in LaTeX",
|
| 171 |
line_number=line_num,
|
| 172 |
-
line_content=line.strip()
|
| 173 |
suggestion="Convert to LaTeX formatting or remove if unintentional"
|
| 174 |
))
|
| 175 |
|
|
|
|
| 125 |
severity=CheckSeverity.ERROR,
|
| 126 |
message=f"{description} detected",
|
| 127 |
line_number=line_num,
|
| 128 |
+
line_content=line.strip(),
|
| 129 |
suggestion="Remove AI-generated conversational text"
|
| 130 |
))
|
| 131 |
break # One match per line for this category
|
|
|
|
| 139 |
severity=CheckSeverity.WARNING,
|
| 140 |
message=f"{description}: '{match.group(0)[:50]}'",
|
| 141 |
line_number=line_num,
|
| 142 |
+
line_content=line.strip(),
|
| 143 |
suggestion="Replace placeholder with actual content or remove"
|
| 144 |
))
|
| 145 |
|
|
|
|
| 169 |
severity=CheckSeverity.INFO,
|
| 170 |
message=f"Possible {description} in LaTeX",
|
| 171 |
line_number=line_num,
|
| 172 |
+
line_content=line.strip(),
|
| 173 |
suggestion="Convert to LaTeX formatting or remove if unintentional"
|
| 174 |
))
|
| 175 |
|
src/checkers/anonymization_checker.py
CHANGED
|
@@ -79,7 +79,7 @@ class AnonymizationChecker(BaseChecker):
|
|
| 79 |
severity=CheckSeverity.WARNING,
|
| 80 |
message=f"{desc} in comment (could be revealed when compiling)",
|
| 81 |
line_number=line_num,
|
| 82 |
-
line_content=line.strip()
|
| 83 |
suggestion="Remove or anonymize URL even in comments"
|
| 84 |
))
|
| 85 |
continue
|
|
@@ -91,7 +91,7 @@ class AnonymizationChecker(BaseChecker):
|
|
| 91 |
severity=CheckSeverity.ERROR,
|
| 92 |
message=f"{desc} may reveal author identity",
|
| 93 |
line_number=line_num,
|
| 94 |
-
line_content=line.strip()
|
| 95 |
suggestion="Replace with anonymized URL or remove for review"
|
| 96 |
))
|
| 97 |
|
|
@@ -112,7 +112,7 @@ class AnonymizationChecker(BaseChecker):
|
|
| 112 |
severity=CheckSeverity.WARNING,
|
| 113 |
message="Potentially self-revealing citation pattern",
|
| 114 |
line_number=line_num,
|
| 115 |
-
line_content=line.strip()
|
| 116 |
suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
|
| 117 |
))
|
| 118 |
|
|
|
|
| 79 |
severity=CheckSeverity.WARNING,
|
| 80 |
message=f"{desc} in comment (could be revealed when compiling)",
|
| 81 |
line_number=line_num,
|
| 82 |
+
line_content=line.strip(),
|
| 83 |
suggestion="Remove or anonymize URL even in comments"
|
| 84 |
))
|
| 85 |
continue
|
|
|
|
| 91 |
severity=CheckSeverity.ERROR,
|
| 92 |
message=f"{desc} may reveal author identity",
|
| 93 |
line_number=line_num,
|
| 94 |
+
line_content=line.strip(),
|
| 95 |
suggestion="Replace with anonymized URL or remove for review"
|
| 96 |
))
|
| 97 |
|
|
|
|
| 112 |
severity=CheckSeverity.WARNING,
|
| 113 |
message="Potentially self-revealing citation pattern",
|
| 114 |
line_number=line_num,
|
| 115 |
+
line_content=line.strip(),
|
| 116 |
suggestion="Rephrase to avoid revealing authorship (e.g., 'Prior work shows...')"
|
| 117 |
))
|
| 118 |
|
src/checkers/base.py
CHANGED
|
@@ -29,7 +29,10 @@ class CheckResult:
|
|
| 29 |
line_content: Optional[str] = None
|
| 30 |
suggestion: Optional[str] = None
|
| 31 |
file_path: Optional[str] = None
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
| 33 |
def to_dict(self) -> dict:
|
| 34 |
return {
|
| 35 |
'checker': self.checker_name,
|
|
@@ -39,7 +42,8 @@ class CheckResult:
|
|
| 39 |
'line': self.line_number,
|
| 40 |
'content': self.line_content,
|
| 41 |
'suggestion': self.suggestion,
|
| 42 |
-
'file_path': self.file_path
|
|
|
|
| 43 |
}
|
| 44 |
|
| 45 |
|
|
@@ -178,7 +182,8 @@ class BaseChecker(ABC):
|
|
| 178 |
message: str,
|
| 179 |
line_number: Optional[int] = None,
|
| 180 |
line_content: Optional[str] = None,
|
| 181 |
-
suggestion: Optional[str] = None
|
|
|
|
| 182 |
) -> CheckResult:
|
| 183 |
"""Helper to create a CheckResult with this checker's name."""
|
| 184 |
return CheckResult(
|
|
@@ -188,6 +193,7 @@ class BaseChecker(ABC):
|
|
| 188 |
message=message,
|
| 189 |
line_number=line_number,
|
| 190 |
line_content=line_content,
|
| 191 |
-
suggestion=suggestion
|
|
|
|
| 192 |
)
|
| 193 |
|
|
|
|
| 29 |
line_content: Optional[str] = None
|
| 30 |
suggestion: Optional[str] = None
|
| 31 |
file_path: Optional[str] = None
|
| 32 |
+
# Substring of line_content that triggered the issue. The renderer wraps
|
| 33 |
+
# this in <mark> so the user can see *where* in the line to look.
|
| 34 |
+
match_text: Optional[str] = None
|
| 35 |
+
|
| 36 |
def to_dict(self) -> dict:
|
| 37 |
return {
|
| 38 |
'checker': self.checker_name,
|
|
|
|
| 42 |
'line': self.line_number,
|
| 43 |
'content': self.line_content,
|
| 44 |
'suggestion': self.suggestion,
|
| 45 |
+
'file_path': self.file_path,
|
| 46 |
+
'match_text': self.match_text,
|
| 47 |
}
|
| 48 |
|
| 49 |
|
|
|
|
| 182 |
message: str,
|
| 183 |
line_number: Optional[int] = None,
|
| 184 |
line_content: Optional[str] = None,
|
| 185 |
+
suggestion: Optional[str] = None,
|
| 186 |
+
match_text: Optional[str] = None,
|
| 187 |
) -> CheckResult:
|
| 188 |
"""Helper to create a CheckResult with this checker's name."""
|
| 189 |
return CheckResult(
|
|
|
|
| 193 |
message=message,
|
| 194 |
line_number=line_number,
|
| 195 |
line_content=line_content,
|
| 196 |
+
suggestion=suggestion,
|
| 197 |
+
match_text=match_text,
|
| 198 |
)
|
| 199 |
|
src/checkers/citation_quality_checker.py
CHANGED
|
@@ -124,7 +124,7 @@ class CitationQualityChecker(BaseChecker):
|
|
| 124 |
severity=CheckSeverity.WARNING,
|
| 125 |
message="Appears to be hardcoded citation instead of \\cite",
|
| 126 |
line_number=line_num,
|
| 127 |
-
line_content=line.strip()
|
| 128 |
suggestion="Use \\cite{} for proper bibliography management"
|
| 129 |
))
|
| 130 |
|
|
|
|
| 124 |
severity=CheckSeverity.WARNING,
|
| 125 |
message="Appears to be hardcoded citation instead of \\cite",
|
| 126 |
line_number=line_num,
|
| 127 |
+
line_content=line.strip(),
|
| 128 |
suggestion="Use \\cite{} for proper bibliography management"
|
| 129 |
))
|
| 130 |
|
src/checkers/consistency_checker.py
CHANGED
|
@@ -149,25 +149,45 @@ class ConsistencyChecker(BaseChecker):
|
|
| 149 |
]
|
| 150 |
|
| 151 |
def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
|
|
|
|
| 152 |
results = []
|
| 153 |
-
|
| 154 |
# Remove comments
|
| 155 |
content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)
|
| 156 |
content_lower = content.lower()
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
# Check for known variant inconsistencies
|
| 159 |
-
for canonical, variants in
|
| 160 |
found_forms = []
|
| 161 |
-
|
| 162 |
# Check canonical form
|
| 163 |
if re.search(rf'\b{re.escape(canonical)}\b', content, re.IGNORECASE):
|
| 164 |
found_forms.append(canonical)
|
| 165 |
-
|
| 166 |
# Check variants
|
| 167 |
for variant in variants:
|
| 168 |
if re.search(rf'\b{re.escape(variant)}\b', content, re.IGNORECASE):
|
| 169 |
found_forms.append(variant)
|
| 170 |
-
|
| 171 |
if len(found_forms) > 1:
|
| 172 |
results.append(self._create_result(
|
| 173 |
passed=False,
|
|
|
|
| 149 |
]
|
| 150 |
|
| 151 |
def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
|
| 152 |
+
config = config or {}
|
| 153 |
results = []
|
| 154 |
+
|
| 155 |
# Remove comments
|
| 156 |
content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)
|
| 157 |
content_lower = content.lower()
|
| 158 |
+
|
| 159 |
+
# Merge user glossary preferred terms into the variant table
|
| 160 |
+
glossary_preferred = list(config.get('glossary_preferred', []))
|
| 161 |
+
merged_variants = dict(self.KNOWN_VARIANTS)
|
| 162 |
+
for term in glossary_preferred:
|
| 163 |
+
term = (term or "").strip()
|
| 164 |
+
if not term:
|
| 165 |
+
continue
|
| 166 |
+
# Generate plausible variants: hyphen ↔ space ↔ collapsed; lower vs canonical
|
| 167 |
+
forms = {term}
|
| 168 |
+
if "-" in term:
|
| 169 |
+
forms.add(term.replace("-", " "))
|
| 170 |
+
forms.add(term.replace("-", ""))
|
| 171 |
+
if " " in term:
|
| 172 |
+
forms.add(term.replace(" ", "-"))
|
| 173 |
+
forms.add(term.replace(" ", ""))
|
| 174 |
+
forms.discard(term)
|
| 175 |
+
if forms:
|
| 176 |
+
merged_variants.setdefault(term, []).extend(sorted(forms))
|
| 177 |
+
|
| 178 |
# Check for known variant inconsistencies
|
| 179 |
+
for canonical, variants in merged_variants.items():
|
| 180 |
found_forms = []
|
| 181 |
+
|
| 182 |
# Check canonical form
|
| 183 |
if re.search(rf'\b{re.escape(canonical)}\b', content, re.IGNORECASE):
|
| 184 |
found_forms.append(canonical)
|
| 185 |
+
|
| 186 |
# Check variants
|
| 187 |
for variant in variants:
|
| 188 |
if re.search(rf'\b{re.escape(variant)}\b', content, re.IGNORECASE):
|
| 189 |
found_forms.append(variant)
|
| 190 |
+
|
| 191 |
if len(found_forms) > 1:
|
| 192 |
results.append(self._create_result(
|
| 193 |
passed=False,
|
src/checkers/formatting_checker.py
CHANGED
|
@@ -41,9 +41,6 @@ class FormattingChecker(BaseChecker):
|
|
| 41 |
'^': r'(?<![\\$])\^(?![^$]*\$)', # Unescaped ^ outside math
|
| 42 |
}
|
| 43 |
|
| 44 |
-
# Multiple blank lines pattern (3 or more blank lines)
|
| 45 |
-
MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')
|
| 46 |
-
|
| 47 |
def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
|
| 48 |
results = []
|
| 49 |
lines = tex_content.split('\n')
|
|
@@ -66,8 +63,9 @@ class FormattingChecker(BaseChecker):
|
|
| 66 |
severity=CheckSeverity.INFO,
|
| 67 |
message="Citation without non-breaking space",
|
| 68 |
line_number=line_num,
|
| 69 |
-
line_content=line.strip()
|
| 70 |
-
suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
|
|
|
|
| 71 |
))
|
| 72 |
|
| 73 |
# Track citation styles
|
|
@@ -90,40 +88,6 @@ class FormattingChecker(BaseChecker):
|
|
| 90 |
suggestion="Consider using consistent citation style throughout"
|
| 91 |
))
|
| 92 |
|
| 93 |
-
# Check for multiple blank lines (3 or more)
|
| 94 |
-
for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
|
| 95 |
-
line_num = self._find_line_number(tex_content, match.start())
|
| 96 |
-
# Count how many blank lines
|
| 97 |
-
blank_count = match.group(0).count('\n') - 1
|
| 98 |
-
|
| 99 |
-
# Get context: the line before, blank lines, and the line after
|
| 100 |
-
start_pos = match.start()
|
| 101 |
-
end_pos = match.end()
|
| 102 |
-
|
| 103 |
-
# Find the line before the blank lines
|
| 104 |
-
prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
|
| 105 |
-
prev_line_end = start_pos
|
| 106 |
-
prev_line = tex_content[prev_line_start:prev_line_end].rstrip()
|
| 107 |
-
|
| 108 |
-
# Find the line after the blank lines
|
| 109 |
-
next_line_end = tex_content.find('\n', end_pos)
|
| 110 |
-
if next_line_end == -1:
|
| 111 |
-
next_line_end = len(tex_content)
|
| 112 |
-
next_line = tex_content[end_pos:next_line_end].rstrip()
|
| 113 |
-
|
| 114 |
-
# Create visual representation with warning markers
|
| 115 |
-
blank_lines = '\n'.join([f"> blank line ⚠️"] * blank_count)
|
| 116 |
-
line_content = f"{prev_line}\n{blank_lines}\n{next_line}"
|
| 117 |
-
|
| 118 |
-
results.append(self._create_result(
|
| 119 |
-
passed=False,
|
| 120 |
-
severity=CheckSeverity.INFO,
|
| 121 |
-
message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
|
| 122 |
-
line_number=line_num,
|
| 123 |
-
line_content=line_content,
|
| 124 |
-
suggestion="Reduce to single blank line or use \\vspace"
|
| 125 |
-
))
|
| 126 |
-
|
| 127 |
# Check for common issues with special characters
|
| 128 |
results.extend(self._check_special_chars(tex_content, lines))
|
| 129 |
|
|
@@ -159,8 +123,9 @@ class FormattingChecker(BaseChecker):
|
|
| 159 |
severity=CheckSeverity.WARNING,
|
| 160 |
message="Unescaped & outside tabular/math environment",
|
| 161 |
line_number=line_num,
|
| 162 |
-
line_content=line.strip()
|
| 163 |
-
suggestion="Use \\& to escape"
|
|
|
|
| 164 |
))
|
| 165 |
|
| 166 |
return results
|
|
|
|
| 41 |
'^': r'(?<![\\$])\^(?![^$]*\$)', # Unescaped ^ outside math
|
| 42 |
}
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
|
| 45 |
results = []
|
| 46 |
lines = tex_content.split('\n')
|
|
|
|
| 63 |
severity=CheckSeverity.INFO,
|
| 64 |
message="Citation without non-breaking space",
|
| 65 |
line_number=line_num,
|
| 66 |
+
line_content=line.strip(),
|
| 67 |
+
suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')",
|
| 68 |
+
match_text=match.group(0),
|
| 69 |
))
|
| 70 |
|
| 71 |
# Track citation styles
|
|
|
|
| 88 |
suggestion="Consider using consistent citation style throughout"
|
| 89 |
))
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
# Check for common issues with special characters
|
| 92 |
results.extend(self._check_special_chars(tex_content, lines))
|
| 93 |
|
|
|
|
| 123 |
severity=CheckSeverity.WARNING,
|
| 124 |
message="Unescaped & outside tabular/math environment",
|
| 125 |
line_number=line_num,
|
| 126 |
+
line_content=line.strip(),
|
| 127 |
+
suggestion="Use \\& to escape",
|
| 128 |
+
match_text=match.group(0),
|
| 129 |
))
|
| 130 |
|
| 131 |
return results
|
src/checkers/retraction_checker.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Bibliography-level checker that flags retracted DOIs.
|
| 3 |
+
|
| 4 |
+
Unlike the LaTeX-line checkers in src/checkers/, this one operates on parsed
|
| 5 |
+
BibEntry objects, not on a tex_content string. main.py / app.py invoke it
|
| 6 |
+
directly via `check_entries(entries)`.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import concurrent.futures
|
| 11 |
+
import logging
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from typing import Iterable, List
|
| 14 |
+
|
| 15 |
+
from src.fetchers.retraction_fetcher import RetractionFetcher, RetractionResult
|
| 16 |
+
from src.parsers.bib_parser import BibEntry
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
|
| 22 |
+
class RetractionFinding:
|
| 23 |
+
entry_key: str
|
| 24 |
+
doi: str
|
| 25 |
+
result: RetractionResult
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class RetractionChecker:
    """Concurrent batch retraction lookup over parsed BibEntry objects.

    Entries without a DOI are skipped; the remainder are checked in parallel
    with a thread pool (the work is network-bound, so threads are appropriate).
    Lookups are best-effort: a failure for one DOI never aborts the batch.
    """

    def __init__(self, max_workers: int = 6):
        # max_workers bounds concurrent HTTP lookups against the retraction API.
        self.fetcher = RetractionFetcher()
        self.max_workers = max_workers

    def check_entries(self, entries: Iterable[BibEntry]) -> List[RetractionFinding]:
        """Look up retraction status for every entry that has a DOI.

        Returns one RetractionFinding per entry whose DOI is retracted or has
        a recorded update (correction, expression of concern, ...). Entries
        whose lookup fails or returns None are silently skipped (with a log).
        """
        with_doi = [e for e in entries if getattr(e, "doi", "")]
        if not with_doi:
            return []

        findings: List[RetractionFinding] = []

        def _one(entry: BibEntry):
            # Executor.map re-raises worker exceptions in the caller, which
            # would abort the whole batch and discard every finding collected
            # so far — so catch per entry and degrade to "no result".
            try:
                return entry, self.fetcher.check(entry.doi)
            except Exception as exc:  # network / parsing errors are best-effort
                logger.warning(
                    "Retraction lookup failed for %s (%s): %s",
                    entry.key, entry.doi, exc,
                )
                return entry, None

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            for entry, res in ex.map(_one, with_doi):
                if res is None:
                    continue
                # Flag both outright retractions and softer updates.
                if res.is_retracted or res.update_type:
                    findings.append(RetractionFinding(entry.key, entry.doi, res))
        return findings
|
src/checkers/sentence_checker.py
CHANGED
|
@@ -76,7 +76,7 @@ class SentenceChecker(BaseChecker):
|
|
| 76 |
severity=CheckSeverity.INFO,
|
| 77 |
message=message,
|
| 78 |
line_number=line_num,
|
| 79 |
-
line_content=line.strip()
|
| 80 |
))
|
| 81 |
break # One per line
|
| 82 |
|
|
|
|
| 76 |
severity=CheckSeverity.INFO,
|
| 77 |
message=message,
|
| 78 |
line_number=line_num,
|
| 79 |
+
line_content=line.strip()
|
| 80 |
))
|
| 81 |
break # One per line
|
| 82 |
|
src/checkers/template_checker.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Conference-template conformance checker.
|
| 3 |
+
|
| 4 |
+
Reads the rich rule set defined in :mod:`src.templates.base_template` and runs
|
| 5 |
+
per-venue checks against the LaTeX source. Each rule fragment lives in its own
|
| 6 |
+
small private method so adding new conferences (or new rules) doesn't bloat the
|
| 7 |
+
public ``check`` method.
|
| 8 |
+
|
| 9 |
+
Severity convention used here:
|
| 10 |
+
|
| 11 |
+
* ``ERROR`` — desk-reject material if uncorrected (NeurIPS missing checklist,
|
| 12 |
+
ACL missing Limitations, double-blind \\author leak).
|
| 13 |
+
* ``WARNING`` — likely a real problem but might be a false positive (style
|
| 14 |
+
package mismatch, identifying URL).
|
| 15 |
+
* ``INFO`` — soft reminder that something MUST happen later (camera-ready
|
| 16 |
+
sections, lay summaries, font requirements, page-limit
|
| 17 |
+
estimation that the .tex source can't actually verify).
|
| 18 |
+
"""
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import re
|
| 22 |
+
from typing import List, Optional
|
| 23 |
+
|
| 24 |
+
from .base import BaseChecker, CheckResult, CheckSeverity
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ------------------------------------------------------------------ helpers ---
|
| 28 |
+
|
| 29 |
+
# Match \section{X}, \subsection{X}, \paragraph{X}, optionally starred,
|
| 30 |
+
# allowing an optional [short] argument before the {body}.
|
| 31 |
+
def _section_pattern(name: str) -> re.Pattern:
|
| 32 |
+
return re.compile(
|
| 33 |
+
r'\\(?:section|subsection|paragraph)\*?\s*(?:\[[^\]]*\])?\s*\{[^}]*?'
|
| 34 |
+
+ re.escape(name) + r'[^}]*\}',
|
| 35 |
+
re.IGNORECASE,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Domains/URL patterns that strongly de-anonymize an author. Whitelisted
# domains (which legitimately appear in CV/ML papers without leaking identity)
# are excluded. Each pattern uses a negative lookahead so "anonymous"
# organization/user paths are not flagged.
_IDENTIFYING_URL_PATTERNS = [
    re.compile(r'\bgithub\.com/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
    re.compile(r'\b[A-Za-z0-9_\-]+\.github\.io\b', re.IGNORECASE),
    re.compile(r'\bgitlab\.com/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
    re.compile(r'\bbitbucket\.org/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
    re.compile(r'\b(?:huggingface\.co|wandb\.ai)/(?!anonymous)[A-Za-z0-9_\-]+/', re.IGNORECASE),
    re.compile(r'\b(?:linkedin|twitter|x)\.com/[A-Za-z0-9_\-]+', re.IGNORECASE),
]

# URLs that are explicitly anonymous-friendly and should NOT be flagged.
_ANONYMOUS_URL_HINTS = re.compile(
    r'(anonymous|anon|blind|review|submission|4open\.science)', re.IGNORECASE,
)

# Capture URLs from \url{...}, \href{...}{...}, and bare http(s)://...
# Group 1: argument of \url/\href; group 2: bare URL (lookbehind avoids
# matching the tail of a longer URL or a word).
_URL_FROM_TEX = re.compile(
    r'\\(?:url|href)\s*\{([^}]+)\}|(?<![/\w])(https?://[^\s,)\\]+)',
)

# Acknowledgments macros / sections used by various templates
# (plain \section, the \acknowledgments macro, and ACM's acks environment).
_ACK_PATTERNS = [
    re.compile(r'\\section\*?\s*\{\s*Acknowledg\w*\s*\}', re.IGNORECASE),
    re.compile(r'\\acknowledgments?\s*\{', re.IGNORECASE),
    re.compile(r'\\begin\{acks\}', re.IGNORECASE),
]

# NeurIPS Paper Checklist markers — the official template either calls
# \input{neurips_paper_checklist} or includes a \section*{NeurIPS Paper Checklist}.
_NEURIPS_CHECKLIST_PATTERNS = [
    re.compile(r'\\section\*?\s*\{[^}]*Paper\s+Checklist[^}]*\}', re.IGNORECASE),
    re.compile(r'\\input\{[^}]*paper[_\-]?checklist[^}]*\}', re.IGNORECASE),
    re.compile(r'\\input\{[^}]*neurips[_\-]?\d{0,4}[_\-]?checklist[^}]*\}', re.IGNORECASE),
    re.compile(r'\\paperchecklist\b', re.IGNORECASE),
]

# Reproducibility Statement (ICLR / NeurIPS).
_REPRO_SECTION = re.compile(
    r'\\section\*?\s*\{[^}]*Reproducibility[^}]*\}', re.IGNORECASE,
)

# Document-class options carry the paper size.
# Group 1: the optional [...] option list (may be absent); group 2: class name.
_DOCCLASS_RE = re.compile(
    r'\\documentclass\s*(?:\[([^\]]*)\])?\s*\{([^}]+)\}'
)

# A very rough regex for figures/tables INSIDE the Limitations section
# (used to enforce ACL "discussion only" rule).
_FLOAT_OR_NEW_SECTION_RE = re.compile(
    r'\\begin\{(?:table|figure|algorithm)\*?\}|\\section\*?\s*\{', re.IGNORECASE,
)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# ----------------------------------------------------------------- checker ---
|
| 95 |
+
|
| 96 |
+
class TemplateChecker(BaseChecker):
    """Check a manuscript against the per-venue rules of its conference template.

    The template object (see :mod:`src.templates.base_template`) carries both
    rule flags (``double_blind``, ``requires_paper_checklist``, ...) and venue
    metadata used in messages. Severity follows the module convention: ERROR
    for desk-reject material, WARNING for likely problems, INFO for reminders
    that cannot be verified from the .tex source alone.
    """

    name = "template"
    display_name = "Conference Template"
    description = "Verify per-venue submission rules (sections, style, anonymity, deliverables)"

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run every rule the configured template activates; [] when no template is set."""
        cfg = config or {}
        template = cfg.get("template")
        if template is None:
            return []

        source = self._remove_comments(tex_content)
        findings: List[CheckResult] = []

        # Universal structural checks.
        self._check_mandatory_sections(template, source, findings)
        self._check_camera_only_sections(template, source, findings)
        self._check_style_package(template, source, findings)
        self._check_doc_class(template, source, findings)
        self._check_paper_size(template, source, findings)

        # Anonymity rules.
        if template.double_blind:
            self._check_double_blind_author(template, source, findings)
        if template.forbid_identifying_urls:
            self._check_identifying_urls(template, source, findings)
        if template.forbid_acks_in_review:
            self._check_acknowledgments(template, source, findings)

        # Venue-specific deliverables.
        if template.requires_paper_checklist:
            self._check_paper_checklist(template, source, findings)
        if template.requires_reproducibility_statement:
            self._check_reproducibility_statement(template, source, findings)
        if template.requires_lay_summary_camera:
            self._inform_lay_summary(template, findings)
        if template.requires_type1_fonts:
            self._inform_type1_fonts(template, findings)
        if template.min_main_pages > 0:
            self._inform_min_pages(template, findings)

        # ACL-family rule: Limitations must be discussion only.
        if "Limitations" in template.mandatory_sections:
            self._check_limitations_content(template, source, findings)

        return findings

    # ============================================================== sections ==

    def _check_mandatory_sections(self, template, content: str, results: List[CheckResult]):
        """ERROR for every template-mandated section absent from the source."""
        for section in template.mandatory_sections or []:
            if _section_pattern(section).search(content):
                continue
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.ERROR,
                message=f"[{template.name}] Missing mandatory section: '{section}'",
                suggestion=f"Add `\\section{{{section}}}` (required by {template.name}).",
            ))

    def _check_camera_only_sections(self, template, content: str, results: List[CheckResult]):
        """INFO for camera-ready-only sections not yet present (optional during review)."""
        for section in template.mandatory_camera_sections or []:
            if _section_pattern(section).search(content):
                continue
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=(
                    f"[{template.name}] Camera-ready section '{section}' not found. "
                    "Required for the camera-ready version, optional for review."
                ),
                suggestion=f"Add `\\section{{{section}}}` before References for camera-ready.",
            ))

    # =================================================== style / typesetting ==

    def _check_style_package(self, template, content: str, results: List[CheckResult]):
        """WARNING when the venue's official style package is never loaded."""
        package = (template.style_package or "").strip()
        if not package:
            return
        loaded = re.compile(
            r'\\(?:usepackage|documentclass)(?:\[[^\]]*\])?\s*\{\s*'
            + re.escape(package) + r'\s*\}'
        )
        if loaded.search(content):
            return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.WARNING,
            message=(
                f"[{template.name}] Style package '{package}' not found. "
                "If you really are submitting to this venue, your template may be wrong."
            ),
            suggestion=f"Use the official `{package}` style package.",
        ))

    def _check_doc_class(self, template, content: str, results: List[CheckResult]):
        """WARNING when \\documentclass differs from the template's expected class."""
        expected = (template.doc_class or "").strip()
        if not expected:
            return
        match = _DOCCLASS_RE.search(content)
        found = match.group(2).strip() if match else ""
        if found.lower() == expected.lower():
            return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.WARNING,
            message=(
                f"[{template.name}] Expected `\\documentclass{{{expected}}}`, "
                f"found `{found or 'none'}`."
            ),
            suggestion=f"Use the official document class `{expected}` (Springer LNCS for ECCV).",
        ))

    def _check_paper_size(self, template, content: str, results: List[CheckResult]):
        """WARNING when document-class options select the wrong paper size."""
        expected = (template.paper_size or "").lower()
        if expected not in {"letter", "a4"}:
            return
        match = _DOCCLASS_RE.search(content)
        if match is None:
            return
        options = (match.group(1) or "").lower()
        # "letterpaper"/"a4paper" both contain the short form, so a substring
        # test on the short form covers either spelling.
        if "letter" in options:
            found = "letter"
        elif "a4" in options:
            found = "a4"
        else:
            found = None
        if found is None or found == expected:
            return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.WARNING,
            message=(
                f"[{template.name}] Expected paper size '{expected}', "
                f"document class is set to '{found}'."
            ),
            suggestion=f"Use `\\documentclass[{expected}paper]{{...}}`.",
        ))

    # ================================================================ blinding =

    def _check_double_blind_author(self, template, content: str, results: List[CheckResult]):
        """ERROR when \\author carries a non-empty, non-anonymous body."""
        match = re.search(r'\\author\s*(?:\[[^\]]*\])?\s*\{([^}]*)\}', content)
        if match is None:
            return
        author = match.group(1)
        if not author.strip():
            return
        # Placeholder bodies ("Anonymous submission" etc.) are acceptable.
        if re.search(r'(anonymous|hidden|blind|submission)', author, re.IGNORECASE):
            return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.ERROR,
            message=f"[{template.name}] Double-blind: \\author appears to contain identifying info",
            line_number=self._find_line_number(content, match.start()),
            line_content=author.strip(),
            suggestion=r"Replace \author with anonymous placeholder during review.",
        ))

    def _check_identifying_urls(self, template, content: str, results: List[CheckResult]):
        """WARNING per URL matching a de-anonymizing pattern (at most one finding each)."""
        for match in _URL_FROM_TEX.finditer(content):
            url = (match.group(1) or match.group(2) or "").strip()
            if not url or _ANONYMOUS_URL_HINTS.search(url):
                continue
            if not any(pattern.search(url) for pattern in _IDENTIFYING_URL_PATTERNS):
                continue
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message=(
                    f"[{template.name}] Possible identifying URL during double-blind review: "
                    f"{url[:120]}"
                ),
                line_number=self._find_line_number(content, match.start()),
                line_content=url,
                suggestion=(
                    "Use Anonymous GitHub (https://anonymous.4open.science) or remove "
                    "the link until the camera-ready version."
                ),
            ))

    def _check_acknowledgments(self, template, content: str, results: List[CheckResult]):
        """WARNING on the first acknowledgments macro/section found in review mode."""
        for pattern in _ACK_PATTERNS:
            hit = pattern.search(content)
            if hit is None:
                continue
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.WARNING,
                message=(
                    f"[{template.name}] Acknowledgments section detected; "
                    f"{template.short_name.upper()} requires omitting it during review."
                ),
                line_number=self._find_line_number(content, hit.start()),
                suggestion=(
                    "Comment out or wrap acks in `\\if<reviewmode>...\\fi` so they only "
                    "appear in the camera-ready version."
                ),
            ))
            return  # one finding is enough

    # ============================================== per-venue special items ===

    def _check_paper_checklist(self, template, content: str, results: List[CheckResult]):
        """ERROR when none of the NeurIPS checklist markers appear."""
        if any(pattern.search(content) for pattern in _NEURIPS_CHECKLIST_PATTERNS):
            return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.ERROR,
            message=(
                f"[{template.name}] NeurIPS Paper Checklist not found. "
                "NeurIPS desk-rejects submissions without the checklist."
            ),
            suggestion=(
                "Add `\\input{neurips_paper_checklist}` (or paste the official template) "
                "after References / supplementary."
            ),
        ))

    def _check_reproducibility_statement(self, template, content: str, results: List[CheckResult]):
        """INFO when no Reproducibility Statement section exists."""
        if _REPRO_SECTION.search(content):
            return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.INFO,
            message=(
                f"[{template.name}] Reproducibility Statement not found. "
                "It's encouraged (~1 page) and does not count toward the page limit."
            ),
            suggestion=(
                "Add `\\section*{Reproducibility Statement}` before References summarizing "
                "code/data/seeds/hyperparameter availability."
            ),
        ))

    def _inform_lay_summary(self, template, results: List[CheckResult]):
        """Unconditional INFO: the lay summary is due at camera-ready time."""
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.INFO,
            message=(
                f"[{template.name}] Lay summary required at camera-ready time "
                "(plain-language summary submitted via OpenReview)."
            ),
            suggestion="Draft a 1–2 paragraph plain-language summary now to avoid a last-minute scramble.",
        ))

    def _inform_type1_fonts(self, template, results: List[CheckResult]):
        """Unconditional INFO: font embedding cannot be verified from the .tex source."""
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.INFO,
            message=(
                f"[{template.name}] Embedded fonts must be Type-1 only — verify with "
                "`pdffonts <paper.pdf>`. Cannot be checked from .tex source alone."
            ),
            suggestion="Compile with `pdflatex` (not XeLaTeX/LuaLaTeX) and convert any Type-3 fonts.",
        ))

    def _inform_min_pages(self, template, results: List[CheckResult]):
        """Unconditional INFO: page counts cannot be measured from the .tex source."""
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.INFO,
            message=(
                f"[{template.name}] Main text must be at least {template.min_main_pages} pages "
                f"and at most {template.page_limit_review} pages. Cannot be measured from source."
            ),
            suggestion=(
                f"Compile and confirm the rendered PDF stays within "
                f"{template.min_main_pages}–{template.page_limit_review} pages of main text."
            ),
        ))

    # ============================================ ACL family: Limitations rule

    def _check_limitations_content(self, template, content: str, results: List[CheckResult]):
        """WARNING when the Limitations section contains floats (prose-only rule)."""
        header = re.search(
            r'(\\section\*?\s*(?:\[[^\]]*\])?\s*\{[^}]*Limitations[^}]*\})',
            content, re.IGNORECASE,
        )
        if header is None:
            return  # mandatory_sections check already flagged absence
        # Section body spans from the header to the next \section (or EOF).
        body_start = header.end()
        following = re.search(r'\\section\*?\s*\{', content[body_start:], re.IGNORECASE)
        body_end = body_start + following.start() if following else len(content)
        if not _FLOAT_OR_NEW_SECTION_RE.search(content[body_start:body_end]):
            return
        results.append(self._create_result(
            passed=False,
            severity=CheckSeverity.WARNING,
            message=(
                f"[{template.name}] Limitations section appears to contain floats or a "
                "nested section. ACL/EMNLP/NAACL require Limitations to be discussion only."
            ),
            line_number=self._find_line_number(content, body_start),
            suggestion=(
                "Move tables/figures/algorithms out of Limitations into the main body or "
                "appendix; Limitations should be prose-only."
            ),
        ))
|
src/checkers/url_checker.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
URL liveness checker for bibliography entries.
|
| 3 |
+
|
| 4 |
+
Many @misc / blog / repo references rot over time. This checker does a HEAD
|
| 5 |
+
(falling back to a small GET) on entry.url and flags anything that returns
|
| 6 |
+
4xx/5xx or fails to connect.
|
| 7 |
+
|
| 8 |
+
Operates on BibEntry objects, not on tex_content. Invoked from main.py / app.py
|
| 9 |
+
when `submission_extra.url_liveness` is true.
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import concurrent.futures
|
| 14 |
+
import logging
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
from typing import Iterable, List, Optional
|
| 17 |
+
|
| 18 |
+
import requests
|
| 19 |
+
|
| 20 |
+
from src.utils.http import get_session
|
| 21 |
+
from src.parsers.bib_parser import BibEntry
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class URLFinding:
    """Outcome of one liveness probe against a single bibliography URL."""
    entry_key: str  # BibTeX key of the entry that owns this URL
    url: str  # the URL that was probed
    status: str  # "ok" | "broken" | "unreachable" | "skipped"
    status_code: Optional[int] = None  # HTTP status when a response was received
    detail: str = ""  # short diagnostic (truncated error text or "HTTP <code>")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class URLChecker:
    """Probe bibliography URLs concurrently: HEAD first, tiny GET as fallback."""

    SKIP_PREFIXES = ("mailto:", "ftp://", "tel:", "javascript:")

    def __init__(self, max_workers: int = 8, timeout: float = 15.0):
        # Thread-pool width and per-request timeout (seconds).
        self.max_workers = max_workers
        self.timeout = timeout

    def _check_one(self, entry: BibEntry) -> Optional[URLFinding]:
        """Probe a single entry's URL; returns None when the entry has no URL."""
        url = (entry.url or "").strip()
        if not url:
            return None
        lowered = url.lower()
        if any(lowered.startswith(prefix) for prefix in self.SKIP_PREFIXES):
            return URLFinding(entry.key, url, "skipped", detail="non-http scheme")

        session = get_session()
        try:
            response = session.head(url, allow_redirects=True, timeout=self.timeout)
            # Many servers return 405/403 for HEAD but are fine with GET; double-check with a tiny GET.
            if response.status_code in (403, 405, 501):
                response = session.get(url, allow_redirects=True, timeout=self.timeout, stream=True)
                # Don't actually read the body
                response.close()
        except requests.RequestException as e:
            logger.debug("URL check failed for %s: %s", url, e, exc_info=True)
            return URLFinding(entry.key, url, "unreachable", detail=str(e)[:120])

        code = response.status_code
        if 200 <= code < 400:
            return URLFinding(entry.key, url, "ok", status_code=code)
        return URLFinding(
            entry.key, url, "broken",
            status_code=code,
            detail=f"HTTP {code}",
        )

    def check_entries(self, entries: Iterable[BibEntry]) -> List[URLFinding]:
        """Fan probes out over a thread pool, dropping entries without URLs."""
        targets = [entry for entry in entries if getattr(entry, "url", "")]
        if not targets:
            return []
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            probed = pool.map(self._check_one, targets)
            return [finding for finding in probed if finding is not None]
|
src/config/__pycache__/__init__.cpython-313.pyc
DELETED
|
Binary file (362 Bytes)
|
|
|
src/config/__pycache__/workflow.cpython-313.pyc
DELETED
|
Binary file (7.96 kB)
|
|
|
src/config/__pycache__/yaml_config.cpython-313.pyc
DELETED
|
Binary file (12.4 kB)
|
|
|
src/config/yaml_config.py
CHANGED
|
@@ -97,11 +97,36 @@ class LLMConfig:
|
|
| 97 |
api_key: str = ""
|
| 98 |
|
| 99 |
|
| 100 |
-
@dataclass
|
| 101 |
class OutputConfig:
|
| 102 |
"""Output configuration."""
|
| 103 |
quiet: bool = False
|
| 104 |
minimal_verified: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
@dataclass
|
|
@@ -111,9 +136,12 @@ class BibGuardConfig:
|
|
| 111 |
template: str = ""
|
| 112 |
bibliography: BibliographyConfig = field(default_factory=BibliographyConfig)
|
| 113 |
submission: SubmissionConfig = field(default_factory=SubmissionConfig)
|
|
|
|
| 114 |
workflow: List[WorkflowStep] = field(default_factory=list)
|
| 115 |
llm: LLMConfig = field(default_factory=LLMConfig)
|
| 116 |
output: OutputConfig = field(default_factory=OutputConfig)
|
|
|
|
|
|
|
| 117 |
|
| 118 |
# Internal fields to store discovered files in directory mode
|
| 119 |
_bib_files: List[Path] = field(default_factory=list)
|
|
@@ -225,11 +253,48 @@ def load_config(config_path: str) -> BibGuardConfig:
|
|
| 225 |
# Parse output section
|
| 226 |
if 'output' in data:
|
| 227 |
out = data['output']
|
|
|
|
|
|
|
|
|
|
| 228 |
config.output = OutputConfig(
|
| 229 |
quiet=out.get('quiet', False),
|
| 230 |
-
minimal_verified=out.get('minimal_verified', False)
|
|
|
|
| 231 |
)
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
return config
|
| 234 |
|
| 235 |
|
|
@@ -264,6 +329,15 @@ files:
|
|
| 264 |
|
| 265 |
template: ""
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
bibliography:
|
| 268 |
check_metadata: true
|
| 269 |
check_usage: true
|
|
@@ -285,16 +359,27 @@ submission:
|
|
| 285 |
citation_quality: true
|
| 286 |
anonymization: true
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
llm:
|
| 289 |
-
backend: "gemini"
|
| 290 |
-
model: ""
|
| 291 |
-
api_key: ""
|
| 292 |
|
| 293 |
output:
|
| 294 |
quiet: false
|
| 295 |
minimal_verified: false
|
|
|
|
| 296 |
"""
|
| 297 |
with open(output_path, 'w', encoding='utf-8') as f:
|
| 298 |
f.write(default)
|
| 299 |
-
|
| 300 |
return output_path
|
|
|
|
| 97 |
api_key: str = ""
|
| 98 |
|
| 99 |
|
| 100 |
+
@dataclass
class OutputConfig:
    """Output configuration."""
    quiet: bool = False  # presumably suppresses console output — confirm in report writer
    minimal_verified: bool = False  # presumably trims fully-verified entries — confirm in report writer
    # Which report formats to emit.
    formats: List[str] = field(default_factory=lambda: ["markdown", "html"])  # markdown, html, json
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@dataclass
class NetworkConfig:
    """Network / politeness configuration."""
    contact_email: str = ""  # email used in polite-pool User-Agents (arXiv/CrossRef/OpenAlex)
    cache_enabled: bool = True  # local SQLite cache for HTTP responses
    cache_ttl_hours: int = 24  # cache entry lifetime in hours
    retry_total: int = 5  # total HTTP retry attempts
    retry_backoff_factor: float = 1.5  # backoff factor between retries — presumably urllib3 Retry semantics, confirm in src.utils.http
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@dataclass
class GlossaryConfig:
    """User-supplied project glossary for ConsistencyChecker / AcronymChecker."""
    # House-style spellings to enforce.
    preferred: List[str] = field(default_factory=list)  # e.g. ["Transformer", "fine-tuning"]
    # Acronym -> expansion map.
    acronyms: Dict[str, str] = field(default_factory=dict)  # e.g. {"NLP": "Natural Language Processing"}
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@dataclass
class SubmissionExtraConfig:
    """Extra submission checks added on top of the original list."""
    url_liveness: bool = False  # HEAD-check every entry.url field (slow, off by default)
    retraction: bool = True  # flag retracted DOIs via CrossRef
|
| 130 |
|
| 131 |
|
| 132 |
@dataclass
|
|
|
|
| 136 |
template: str = ""
|
| 137 |
bibliography: BibliographyConfig = field(default_factory=BibliographyConfig)
|
| 138 |
submission: SubmissionConfig = field(default_factory=SubmissionConfig)
|
| 139 |
+
submission_extra: SubmissionExtraConfig = field(default_factory=SubmissionExtraConfig)
|
| 140 |
workflow: List[WorkflowStep] = field(default_factory=list)
|
| 141 |
llm: LLMConfig = field(default_factory=LLMConfig)
|
| 142 |
output: OutputConfig = field(default_factory=OutputConfig)
|
| 143 |
+
network: NetworkConfig = field(default_factory=NetworkConfig)
|
| 144 |
+
glossary: GlossaryConfig = field(default_factory=GlossaryConfig)
|
| 145 |
|
| 146 |
# Internal fields to store discovered files in directory mode
|
| 147 |
_bib_files: List[Path] = field(default_factory=list)
|
|
|
|
| 253 |
# Parse output section
|
| 254 |
if 'output' in data:
|
| 255 |
out = data['output']
|
| 256 |
+
formats = out.get('formats', ["markdown", "html"])
|
| 257 |
+
if isinstance(formats, str):
|
| 258 |
+
formats = [f.strip() for f in formats.split(",") if f.strip()]
|
| 259 |
config.output = OutputConfig(
|
| 260 |
quiet=out.get('quiet', False),
|
| 261 |
+
minimal_verified=out.get('minimal_verified', False),
|
| 262 |
+
formats=list(formats),
|
| 263 |
)
|
| 264 |
+
|
| 265 |
+
# Parse network section
|
| 266 |
+
if 'network' in data:
|
| 267 |
+
net = data['network'] or {}
|
| 268 |
+
config.network = NetworkConfig(
|
| 269 |
+
contact_email=net.get('contact_email', ''),
|
| 270 |
+
cache_enabled=bool(net.get('cache_enabled', True)),
|
| 271 |
+
cache_ttl_hours=int(net.get('cache_ttl_hours', 24)),
|
| 272 |
+
retry_total=int(net.get('retry_total', 5)),
|
| 273 |
+
retry_backoff_factor=float(net.get('retry_backoff_factor', 1.5)),
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
# Parse glossary section
|
| 277 |
+
if 'glossary' in data:
|
| 278 |
+
g = data['glossary'] or {}
|
| 279 |
+
preferred = g.get('preferred', []) or []
|
| 280 |
+
acronyms = g.get('acronyms', {}) or {}
|
| 281 |
+
if not isinstance(preferred, list):
|
| 282 |
+
preferred = [str(preferred)]
|
| 283 |
+
if not isinstance(acronyms, dict):
|
| 284 |
+
acronyms = {}
|
| 285 |
+
config.glossary = GlossaryConfig(
|
| 286 |
+
preferred=[str(x) for x in preferred],
|
| 287 |
+
acronyms={str(k): str(v) for k, v in acronyms.items()},
|
| 288 |
+
)
|
| 289 |
+
|
| 290 |
+
# Parse submission_extra section (URL liveness, retraction)
|
| 291 |
+
if 'submission_extra' in data:
|
| 292 |
+
sx = data['submission_extra'] or {}
|
| 293 |
+
config.submission_extra = SubmissionExtraConfig(
|
| 294 |
+
url_liveness=bool(sx.get('url_liveness', False)),
|
| 295 |
+
retraction=bool(sx.get('retraction', True)),
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
return config
|
| 299 |
|
| 300 |
|
|
|
|
| 329 |
|
| 330 |
template: ""
|
| 331 |
|
| 332 |
+
network:
|
| 333 |
+
# Real email used in polite-pool User-Agents (arXiv/CrossRef/OpenAlex).
|
| 334 |
+
# Strongly recommended.
|
| 335 |
+
contact_email: ""
|
| 336 |
+
cache_enabled: true # Local SQLite cache for HTTP responses
|
| 337 |
+
cache_ttl_hours: 24
|
| 338 |
+
retry_total: 5
|
| 339 |
+
retry_backoff_factor: 1.5
|
| 340 |
+
|
| 341 |
bibliography:
|
| 342 |
check_metadata: true
|
| 343 |
check_usage: true
|
|
|
|
| 359 |
citation_quality: true
|
| 360 |
anonymization: true
|
| 361 |
|
| 362 |
+
submission_extra:
|
| 363 |
+
url_liveness: false # HEAD-check every entry.url field (slow, off by default)
|
| 364 |
+
retraction: true # Flag retracted DOIs via CrossRef
|
| 365 |
+
|
| 366 |
+
# Project-specific glossary helps ConsistencyChecker and AcronymChecker
|
| 367 |
+
# avoid false positives and enforce house style.
|
| 368 |
+
glossary:
|
| 369 |
+
preferred: [] # e.g. ["Transformer", "fine-tuning"]
|
| 370 |
+
acronyms: {} # e.g. {NLP: "Natural Language Processing"}
|
| 371 |
+
|
| 372 |
llm:
|
| 373 |
+
backend: "gemini" # gemini | openai | anthropic | deepseek | ollama | vllm
|
| 374 |
+
model: "" # leave empty for sensible default per backend
|
| 375 |
+
api_key: "" # prefer env var <BACKEND>_API_KEY
|
| 376 |
|
| 377 |
output:
|
| 378 |
quiet: false
|
| 379 |
minimal_verified: false
|
| 380 |
+
formats: [markdown, html] # any of: markdown, html, json
|
| 381 |
"""
|
| 382 |
with open(output_path, 'w', encoding='utf-8') as f:
|
| 383 |
f.write(default)
|
| 384 |
+
|
| 385 |
return output_path
|