CiteScan / src /report /generator.py
aivolcano
Initial commit
350babd
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, List
from pathlib import Path
from ..parsers.bib_parser import BibEntry
from ..analyzers.metadata_comparator import ComparisonResult
from ..analyzers.duplicate_detector import DuplicateGroup
@dataclass
class EntryReport:
    """Pair one bibliography entry with the result of its metadata check.

    This is the bib-only report record: it carries the entry itself plus an
    optional comparison result, and nothing else.
    """

    # The bibliography entry this report describes.
    entry: BibEntry
    # Comparison result for the entry; ``None`` when no comparison is attached.
    comparison: Optional[ComparisonResult]
class ReportGenerator:
"""Generates bibliography-only markdown reports."""
def __init__(
self,
minimal_verified: bool = False,
check_preprint_ratio: bool = True,
preprint_warning_threshold: float = 0.50,
):
self.entries: List[EntryReport] = []
self.duplicate_groups: Optional[List[DuplicateGroup]] = None
self.bib_files: List[str] = []
self.minimal_verified = minimal_verified
self.check_preprint_ratio = check_preprint_ratio
self.preprint_warning_threshold = preprint_warning_threshold
def add_entry_report(self, report: EntryReport):
self.entries.append(report)
def set_metadata(self, bib_files: str | List[str], tex_files: str | List[str] = None):
if isinstance(bib_files, str):
self.bib_files = [bib_files]
else:
self.bib_files = list(bib_files) if bib_files else []
def set_duplicate_groups(self, groups: List[DuplicateGroup]):
self.duplicate_groups = groups
def _is_verified(self, entry: EntryReport) -> bool:
return not self._has_issues(entry)
def _has_issues(self, entry: EntryReport) -> bool:
return bool(entry.comparison and entry.comparison.has_issues)
def _is_preprint(self, entry: BibEntry) -> bool:
preprint_keywords = [
"arxiv", "biorxiv", "medrxiv", "ssrn", "preprint",
"openreview", "techreport", "technical report", "working paper",
]
if entry.entry_type.lower() in ["techreport", "unpublished", "misc"]:
text = " ".join([
entry.journal.lower(), entry.booktitle.lower(),
entry.publisher.lower(), entry.entry_type.lower(),
])
if any(k in text for k in preprint_keywords):
return True
if entry.has_arxiv:
return True
venue = " ".join([entry.journal.lower(), entry.booktitle.lower(), entry.publisher.lower()])
return any(k in venue for k in preprint_keywords)
def get_summary_stats(self) -> dict:
"""Return bibliography issue counts only (no LaTeX)."""
total = len(self.entries)
title_mismatches = author_mismatches = year_mismatches = unable_to_verify = 0
for e in self.entries:
if not e.comparison:
continue
if e.comparison.has_issues:
for issue in e.comparison.issues:
if "Title mismatch" in issue:
title_mismatches += 1
elif "Author mismatch" in issue:
author_mismatches += 1
elif "Year mismatch" in issue:
year_mismatches += 1
elif "Unable to find" in issue:
unable_to_verify += 1
stats = {}
if title_mismatches > 0:
stats["Title Mismatches"] = title_mismatches
if author_mismatches > 0:
stats["Author Mismatches"] = author_mismatches
if year_mismatches > 0:
stats["Year Mismatches"] = year_mismatches
if unable_to_verify > 0:
stats["Unable to Verify"] = unable_to_verify
if self.duplicate_groups:
stats["Duplicate Groups"] = len(self.duplicate_groups)
return stats
def _generate_issues_section(self) -> List[str]:
lines = ["## ⚠️ Critical Issues Detected", ""]
has_any = False
if self.duplicate_groups:
has_any = True
lines.append("### πŸ”„ Duplicate Entries")
for i, group in enumerate(self.duplicate_groups, 1):
lines.append(f"#### Group {i} (Similarity: {group.similarity_score:.0%})")
lines.append(f"**Reason:** {group.reason}")
lines.append("")
lines.append("| Key | Title | Year |")
lines.append("|-----|-------|------|")
for entry in group.entries:
lines.append(f"| `{entry.key}` | {entry.title} | {entry.year} |")
lines.append("")
issue_entries = [e for e in self.entries if self._has_issues(e)]
if issue_entries:
has_any = True
lines.append("### ⚠️ Metadata Issues")
for report in issue_entries:
lines.extend(self._format_entry_detail(report, is_verified=False))
if not has_any:
lines.append("πŸŽ‰ **No critical issues found!**")
return lines
def _generate_verified_section(self) -> List[str]:
lines = ["## βœ… Verified Entries", ""]
verified = [e for e in self.entries if self._is_verified(e)]
if not verified:
lines.append("_No verified entries found._")
return lines
lines.append(f"Found **{len(verified)}** entries with correct metadata.")
lines.append("")
lines.append("<details>")
lines.append("<summary>Click to view verified entries</summary>")
lines.append("")
for report in verified:
lines.extend(self._format_entry_detail(report, minimal=self.minimal_verified, is_verified=True))
lines.append("</details>")
return lines
def _format_entry_detail(self, report: EntryReport, minimal: bool = False, is_verified: bool = False) -> List[str]:
entry = report.entry
comp = report.comparison
lines = []
icon = "βœ…" if is_verified else "⚠️"
lines.append(f"#### {icon} `{entry.key}`")
lines.append(f"**Title:** {entry.title}")
lines.append("")
if comp:
status_icon = "βœ…" if comp.is_match else "❌"
lines.append(f"- **Metadata Status:** {status_icon} {comp.source.upper()} (Confidence: {comp.confidence:.1%})")
if comp.has_issues and not minimal:
lines.append(" - **Discrepancies:**")
for issue in comp.issues:
if "Mismatch" in issue or "mismatch" in issue:
lines.append(f" - πŸ”΄ {issue}")
if "Title" in issue:
lines.append(f" - **Bib:** `{comp.bib_title}`")
lines.append(f" - **Fetched:** `{comp.fetched_title}`")
elif "Author" in issue:
lines.append(f" - **Bib:** `{', '.join(comp.bib_authors)}`")
lines.append(f" - **Fetched:** `{', '.join(comp.fetched_authors)}`")
else:
lines.append(f" - πŸ”Έ {issue}")
lines.append("")
lines.append("---")
lines.append("")
return lines
def save_bibliography_report(self, filepath: str):
"""Generate and save bibliography-only report."""
total = len(self.entries)
verified = sum(1 for e in self.entries if self._is_verified(e))
issues = sum(1 for e in self.entries if self._has_issues(e))
dup_str = str(len(self.duplicate_groups)) if self.duplicate_groups else "N/A"
preprint_str = "N/A"
preprint_warning = []
if self.check_preprint_ratio and self.entries:
preprint_count = sum(1 for e in self.entries if self._is_preprint(e.entry))
preprint_ratio = preprint_count / len(self.entries)
preprint_str = f"{preprint_count} ({preprint_ratio:.1%})"
if preprint_ratio > self.preprint_warning_threshold:
preprint_warning = [
"",
f"> ⚠️ **High Preprint Ratio:** {preprint_ratio:.1%} of entries are preprints.",
]
bib_names = ", ".join([f"`{Path(f).name}`" for f in self.bib_files]) if self.bib_files else "N/A"
lines = [
"# Bibliography Validation Report",
"",
f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
"",
"| File Type | Filename |",
"|-----------|----------|",
f"| **Bib File(s)** | {bib_names} |",
"",
"> **⚠️ Disclaimer:** This report is generated by an automated tool. Please verify reported issues manually.",
"",
"## πŸ“Š Summary",
"",
"| Metric | Count |",
"|--------|-------|",
f"| **Total Entries** | {total} |",
f"| βœ… **Verified (Clean)** | {verified} |",
f"| ⚠️ **With Issues** | {issues} |",
f"| πŸ”„ **Duplicate Groups** | {dup_str} |",
f"| πŸ“„ **Preprints** | {preprint_str} |",
"",
]
if preprint_warning:
lines.extend(preprint_warning)
lines.append("")
lines.extend(self._generate_issues_section())
lines.append("")
lines.extend(self._generate_verified_section())
lines.append("")
lines.append("---")
lines.append(f"Report generated by **CiteScan** on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
with open(filepath, "w", encoding="utf-8") as f:
f.write("\n".join(lines))