Spaces:

yancan
/

CiteScan

Runtime error

aivolcano

Initial commit

350babd about 2 months ago

9.63 kB

	from dataclasses import dataclass
	from datetime import datetime
	from typing import Optional, List
	from pathlib import Path

	from ..parsers.bib_parser import BibEntry
	from ..analyzers.metadata_comparator import ComparisonResult
	from ..analyzers.duplicate_detector import DuplicateGroup


	@dataclass
	class EntryReport:
	    """Report for a single bib entry (bib-only: entry + comparison).

	    Pairs a parsed bibliography entry with the outcome of comparing it
	    against fetched metadata.
	    """

	    # The parsed bibliography entry this report describes.
	    entry: BibEntry
	    # Result of the metadata comparison; None when no comparison is
	    # available for this entry.
	    comparison: Optional[ComparisonResult] = None


	class ReportGenerator:
	    """Generates bibliography-only markdown reports.

	    Entry reports are collected with :meth:`add_entry_report`, optional
	    duplicate groups with :meth:`set_duplicate_groups`, and the final
	    Markdown document is written by :meth:`save_bibliography_report`.
	    """

	    # Substrings that mark a venue string as a preprint-like source.
	    _PREPRINT_KEYWORDS = (
	        "arxiv", "biorxiv", "medrxiv", "ssrn", "preprint",
	        "openreview", "techreport", "technical report", "working paper",
	    )

	    # (substring looked for in an issue message, label used in the stats).
	    # Order matters: the first matching substring wins, and it is also the
	    # order in which labels appear in the returned stats dict.
	    _ISSUE_LABELS = (
	        ("Title mismatch", "Title Mismatches"),
	        ("Author mismatch", "Author Mismatches"),
	        ("Year mismatch", "Year Mismatches"),
	        ("Unable to find", "Unable to Verify"),
	    )

	    def __init__(
	        self,
	        minimal_verified: bool = False,
	        check_preprint_ratio: bool = True,
	        preprint_warning_threshold: float = 0.50,
	    ):
	        """Create an empty generator.

	        Args:
	            minimal_verified: Passed as ``minimal`` when formatting verified
	                entries, which suppresses the discrepancy list.
	            check_preprint_ratio: When true, the report includes a preprint
	                count and, above the threshold, a warning.
	            preprint_warning_threshold: Fraction of entries (0-1) that must
	                be strictly exceeded for the preprint warning to appear.
	        """
	        self.entries: List[EntryReport] = []
	        # None means "duplicate detection was never run"; an empty list
	        # means it ran and found nothing.
	        self.duplicate_groups: Optional[List[DuplicateGroup]] = None
	        self.bib_files: List[str] = []
	        self.minimal_verified = minimal_verified
	        self.check_preprint_ratio = check_preprint_ratio
	        self.preprint_warning_threshold = preprint_warning_threshold

	    def add_entry_report(self, report: EntryReport):
	        """Append one entry report to the set covered by the report."""
	        self.entries.append(report)

	    def set_metadata(
	        self,
	        bib_files: str | List[str],
	        tex_files: str | List[str] | None = None,
	    ):
	        """Record which bib file(s) the report covers.

	        Args:
	            bib_files: A single path or a list of paths. A falsy value
	                clears the list.
	            tex_files: Accepted for call compatibility; this method does
	                not use it.
	        """
	        if isinstance(bib_files, (str, Path)):
	            # A lone path must be wrapped; list() on a str would split it
	            # into characters and list() on a Path would raise.
	            self.bib_files = [str(bib_files)]
	        else:
	            self.bib_files = list(bib_files) if bib_files else []

	    def set_duplicate_groups(self, groups: List[DuplicateGroup]):
	        """Store the duplicate groups to be listed in the report."""
	        self.duplicate_groups = groups

	    def _is_verified(self, entry: EntryReport) -> bool:
	        """Return True when the entry has no reported issues.

	        Entries without a comparison result also count as verified, because
	        the check is simply the negation of :meth:`_has_issues`.
	        """
	        return not self._has_issues(entry)

	    def _has_issues(self, entry: EntryReport) -> bool:
	        """Return True when a comparison exists and it flags issues."""
	        return bool(entry.comparison and entry.comparison.has_issues)

	    def _is_preprint(self, entry: BibEntry) -> bool:
	        """Return True when the entry looks like a preprint.

	        An entry qualifies when its type is ``techreport``, when
	        ``entry.has_arxiv`` is truthy, or when its journal, booktitle or
	        publisher contains one of the preprint keywords. Missing (None)
	        text fields are treated as empty strings.
	        """
	        entry_type = (entry.entry_type or "").lower()
	        # "techreport" is itself a keyword, so that type always qualifies;
	        # the other loosely-typed kinds (misc, unpublished) are decided by
	        # the venue check below.
	        if entry_type == "techreport" or entry.has_arxiv:
	            return True
	        venue = " ".join(
	            (value or "").lower()
	            for value in (entry.journal, entry.booktitle, entry.publisher)
	        )
	        return any(keyword in venue for keyword in self._PREPRINT_KEYWORDS)

	    def get_summary_stats(self) -> dict:
	        """Return bibliography issue counts only (no LaTeX).

	        Each issue message is counted under the first matching category.
	        Categories with a zero count are omitted, and "Duplicate Groups" is
	        included only when at least one group is set.
	        """
	        counts = {label: 0 for _, label in self._ISSUE_LABELS}
	        for report in self.entries:
	            comp = report.comparison
	            if not comp or not comp.has_issues:
	                continue
	            for issue in comp.issues:
	                for needle, label in self._ISSUE_LABELS:
	                    if needle in issue:
	                        counts[label] += 1
	                        break

	        stats = {label: count for label, count in counts.items() if count > 0}
	        if self.duplicate_groups:
	            stats["Duplicate Groups"] = len(self.duplicate_groups)
	        return stats

	    @staticmethod
	    def _table_cell(value) -> str:
	        """Return *value* as text that is safe inside a Markdown table cell.

	        Whitespace runs (including newlines) collapse to single spaces and
	        pipes are escaped, since either would otherwise break the row.
	        """
	        return " ".join(str(value).split()).replace("|", "\\|")

	    def _generate_issues_section(self) -> List[str]:
	        """Build the "Critical Issues" section: duplicates, then metadata issues."""
	        lines = ["## ⚠️ Critical Issues Detected", ""]
	        has_any = False

	        if self.duplicate_groups:
	            has_any = True
	            lines.append("### 🔄 Duplicate Entries")
	            for i, group in enumerate(self.duplicate_groups, 1):
	                lines.append(f"#### Group {i} (Similarity: {group.similarity_score:.0%})")
	                lines.append(f"Reason: {group.reason}")
	                lines.append("")
	                lines.append("| Key | Title | Year |")
	                lines.append("|-----|-------|------|")
	                for entry in group.entries:
	                    title = self._table_cell(entry.title)
	                    year = self._table_cell(entry.year)
	                    lines.append(f"| `{entry.key}` | {title} | {year} |")
	                lines.append("")

	        issue_entries = [e for e in self.entries if self._has_issues(e)]
	        if issue_entries:
	            has_any = True
	            lines.append("### ⚠️ Metadata Issues")
	            for report in issue_entries:
	                lines.extend(self._format_entry_detail(report, is_verified=False))

	        if not has_any:
	            lines.append("🎉 No critical issues found!")
	        return lines

	    def _generate_verified_section(self) -> List[str]:
	        """Build the collapsible section listing entries without issues."""
	        lines = ["## ✅ Verified Entries", ""]
	        verified = [e for e in self.entries if self._is_verified(e)]
	        if not verified:
	            lines.append("_No verified entries found._")
	            return lines

	        lines.append(f"Found {len(verified)} entries with correct metadata.")
	        lines.append("")
	        lines.append("<details>")
	        lines.append("<summary>Click to view verified entries</summary>")
	        # Blank line so the Markdown inside <details> is still parsed.
	        lines.append("")
	        for report in verified:
	            lines.extend(
	                self._format_entry_detail(
	                    report, minimal=self.minimal_verified, is_verified=True
	                )
	            )
	        lines.append("</details>")
	        return lines

	    def _format_entry_detail(
	        self,
	        report: EntryReport,
	        minimal: bool = False,
	        is_verified: bool = False,
	    ) -> List[str]:
	        """Render one entry as Markdown lines.

	        Args:
	            report: The entry report to render.
	            minimal: When true, the discrepancy list is left out.
	            is_verified: Selects the heading icon (check mark vs. warning).

	        Returns:
	            Heading, title, optional metadata status with nested
	            discrepancies, and a trailing horizontal rule.
	        """
	        entry = report.entry
	        comp = report.comparison
	        icon = "✅" if is_verified else "⚠️"
	        lines = [f"#### {icon} `{entry.key}`", f"Title: {entry.title}", ""]

	        if comp:
	            status_icon = "✅" if comp.is_match else "❌"
	            lines.append(
	                f"- Metadata Status: {status_icon} {comp.source.upper()} "
	                f"(Confidence: {comp.confidence:.1%})"
	            )
	            if comp.has_issues and not minimal:
	                # Each level is indented two further spaces so Markdown
	                # renders it as a nested list rather than a flat one.
	                lines.append("  - Discrepancies:")
	                for issue in comp.issues:
	                    if "mismatch" in issue.lower():
	                        lines.append(f"    - 🔴 {issue}")
	                        if "Title" in issue:
	                            lines.append(f"      - Bib: `{comp.bib_title}`")
	                            lines.append(f"      - Fetched: `{comp.fetched_title}`")
	                        elif "Author" in issue:
	                            lines.append(f"      - Bib: `{', '.join(comp.bib_authors)}`")
	                            lines.append(f"      - Fetched: `{', '.join(comp.fetched_authors)}`")
	                    else:
	                        lines.append(f"    - 🔸 {issue}")
	            lines.append("")

	        lines.append("---")
	        lines.append("")
	        return lines

	    def _build_report_lines(self) -> List[str]:
	        """Assemble the full report as a list of Markdown lines."""
	        total = len(self.entries)
	        verified = sum(1 for e in self.entries if self._is_verified(e))
	        issues = sum(1 for e in self.entries if self._has_issues(e))
	        # Only None means "not checked"; an empty list is a real result of 0.
	        if self.duplicate_groups is None:
	            dup_str = "N/A"
	        else:
	            dup_str = str(len(self.duplicate_groups))

	        preprint_str = "N/A"
	        preprint_warning = None
	        if self.check_preprint_ratio and self.entries:
	            preprint_count = sum(1 for e in self.entries if self._is_preprint(e.entry))
	            preprint_ratio = preprint_count / total
	            preprint_str = f"{preprint_count} ({preprint_ratio:.1%})"
	            if preprint_ratio > self.preprint_warning_threshold:
	                preprint_warning = (
	                    f"> ⚠️ High Preprint Ratio: {preprint_ratio:.1%} of entries are preprints."
	                )

	        if self.bib_files:
	            bib_names = ", ".join(f"`{Path(f).name}`" for f in self.bib_files)
	        else:
	            bib_names = "N/A"

	        # One timestamp for header and footer so they can never disagree.
	        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	        lines = [
	            "# Bibliography Validation Report",
	            "",
	            f"Generated: {timestamp}",
	            "",
	            "| File Type | Filename |",
	            "|-----------|----------|",
	            f"| Bib File(s) | {bib_names} |",
	            "",
	            "> ⚠️ Disclaimer: This report is generated by an automated tool. Please verify reported issues manually.",
	            "",
	            "## 📊 Summary",
	            "",
	            "| Metric | Count |",
	            "|--------|-------|",
	            f"| Total Entries | {total} |",
	            f"| ✅ Verified (Clean) | {verified} |",
	            f"| ⚠️ With Issues | {issues} |",
	            f"| 🔄 Duplicate Groups | {dup_str} |",
	            f"| 📄 Preprints | {preprint_str} |",
	            "",
	        ]
	        if preprint_warning:
	            lines.extend([preprint_warning, ""])
	        lines.extend(self._generate_issues_section())
	        lines.append("")
	        lines.extend(self._generate_verified_section())
	        lines.append("")
	        lines.append("---")
	        lines.append(f"Report generated by CiteScan on {timestamp}")
	        return lines

	    def save_bibliography_report(self, filepath: str):
	        """Generate and save bibliography-only report.

	        Args:
	            filepath: Destination of the Markdown file. Missing parent
	                directories are created; an existing file is overwritten.
	        """
	        target = Path(filepath)
	        target.parent.mkdir(parents=True, exist_ok=True)
	        with open(target, "w", encoding="utf-8") as f:
	            f.write("\n".join(self._build_report_lines()))