Mr.Phil / utils.py
luciagomez's picture
upload v1 of Dockerfile, rag, app, requirements and utils
c175b07 verified
import os, re, sqlite3, datetime, requests
from pathlib import Path
from typing import Optional, List, Dict
# Base directory (relative path) under which ensure_foundation_year_dir
# creates the per-foundation "<fid>_data" folders.
DATA_DIR = Path("data")
# Default SQLite database file used by init_provenance_db and log_provenance.
PROV_DB = "provenance.db"
# ---------- SQLite provenance ----------
def init_provenance_db(db_path: str = PROV_DB) -> None:
    """Create the ``retrieved_docs`` provenance table if it does not exist.

    Args:
        db_path: Path of the SQLite database file to open. Defaults to
            ``PROV_DB``.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement
            fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS retrieved_docs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                foundation_id INTEGER NOT NULL,
                year INTEGER,
                title TEXT,
                doc_type TEXT,
                file_path TEXT,
                source_url TEXT,
                fetched_at TEXT DEFAULT CURRENT_TIMESTAMP
            )""")
        conn.commit()
    finally:
        # Close even when execute/commit raises so the handle is not leaked.
        conn.close()
def log_provenance(foundation_id: int, year: Optional[int], title: str,
                   doc_type: str, file_path: str, source_url: str,
                   db_path: str = PROV_DB) -> None:
    """Insert one row describing a retrieved document into ``retrieved_docs``.

    Args:
        foundation_id: Identifier of the foundation the document belongs to.
        year: Year associated with the document, or ``None``.
        title: Document title.
        doc_type: Free-form document type label.
        file_path: Location where the document was saved.
        source_url: URL the document was fetched from.
        db_path: Path of the SQLite database file. Defaults to ``PROV_DB``.

    Raises:
        sqlite3.Error: If the insert fails, e.g. because the table has not
            been created yet.
    """
    # fetched_at is written explicitly as the local, timezone-naive time in
    # ISO 8601 form.
    # NOTE(review): the column default (CURRENT_TIMESTAMP) would use SQLite's
    # own format instead - confirm which one consumers expect.
    row = (foundation_id, year, title, doc_type, file_path, source_url,
           datetime.datetime.now().isoformat())
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            """INSERT INTO retrieved_docs
            (foundation_id, year, title, doc_type, file_path, source_url, fetched_at)
            VALUES (?,?,?,?,?,?,?)""",
            row,
        )
        conn.commit()
    finally:
        # Close even when the insert raises so the handle is not leaked.
        conn.close()
# ---------- Filesystem ----------
def safe_filename(name: str) -> str:
    """Turn *name* into a conservative file name.

    Every run of characters other than word characters, ``-``, ``.`` and
    space becomes a single ``_``; every run of whitespace then becomes a
    single ``_``; finally leading and trailing underscores are removed.
    The result may be empty.
    """
    without_specials = re.sub(r"[^\w\-. ]+", "_", name)
    underscored = re.sub(r"\s+", "_", without_specials)
    return underscored.strip("_")
def ensure_foundation_year_dir(fid: int, year: Optional[int]) -> Path:
    """Create and return the data directory for a foundation.

    The directory is ``DATA_DIR/<fid>_data``, with an extra ``<year>``
    sub-directory when *year* is truthy (so ``None`` and ``0`` both skip it).
    Missing parent directories are created; an existing directory is reused.
    """
    segments = [f"{fid}_data"]
    if year:
        segments.append(str(year))
    target_dir = DATA_DIR.joinpath(*segments)
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir
def download_pdf(url: str, save_dir: Path, preferred_name: Optional[str] = None) -> str:
    """Download *url* into *save_dir* and return the saved file's path.

    Args:
        url: Address of the document to fetch.
        save_dir: Existing directory to write into.
        preferred_name: File name to use instead of the one derived from the
            URL. A ``.pdf`` suffix is appended when missing.

    Returns:
        The path of the written file, as a string.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
        OSError: If the target file cannot be written.
    """
    # Local import keeps this block self-contained; urllib.parse is stdlib.
    from urllib.parse import urlsplit

    # Take the name from the URL *path* only, so query strings that contain
    # "/" and "#fragment" suffixes cannot leak into the file name.
    filename = preferred_name or urlsplit(url).path.split("/")[-1]
    if not filename:
        # URL path ended with "/": avoid producing a file called just ".pdf".
        filename = "document"
    if not filename.lower().endswith(".pdf"):
        filename += ".pdf"
    target = save_dir / safe_filename(filename)

    # The context manager releases the streamed connection back to the pool
    # on every exit path.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        try:
            with open(target, "wb") as out:
                for chunk in response.iter_content(8192):
                    if chunk:
                        out.write(chunk)
        except Exception:
            # Do not leave a truncated file behind that could later be
            # mistaken for a complete download.
            if target.exists():
                target.unlink()
            raise
    return str(target)
# ---------- SerpAPI search ----------
def serpapi_search(query: str, num_results: int = 20, serpapi_key: Optional[str] = None) -> List[Dict]:
    """Run a Google search through SerpAPI and return its organic results.

    Args:
        query: Search query string.
        num_results: Value sent as the ``num`` request parameter.
        serpapi_key: API key; when falsy, the ``SERPAPI_KEY`` environment
            variable is used instead.

    Returns:
        The ``organic_results`` list from the JSON response, or an empty
        list when that key is absent.

    Raises:
        RuntimeError: If no API key is available.
        requests.RequestException: On network failure or a non-2xx status.
    """
    api_key = serpapi_key if serpapi_key else os.getenv("SERPAPI_KEY")
    if not api_key:
        raise RuntimeError("SERPAPI_KEY not set (add it in HF Space Secrets).")

    response = requests.get(
        "https://serpapi.com/search",
        params={"engine": "google", "q": query, "num": num_results, "api_key": api_key},
        timeout=20,
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("organic_results", [])
def _is_pdf_link(link: str) -> bool:
    """Return True when ``.pdf`` occurs anywhere in *link*, ignoring case."""
    # A link that ends with ".pdf" necessarily contains it, so a single
    # containment check covers both cases.
    return ".pdf" in link.lower()
def score_candidate(item: Dict, foundation_name: str, year: Optional[int]) -> float:
    """Heuristically score a search result as a likely annual-report PDF.

    Points are additive:
      * +2.0 if the title contains a report keyword,
      * +1.5 if the first 10 characters of the foundation name (lower-cased)
        occur in the title or link,
      * +1.5 if *year* is truthy and occurs in the title or link,
      * +1.0 if the link looks like a PDF.

    Args:
        item: Search-result mapping; the ``title`` and ``link`` keys are
            read, and missing or ``None`` values are treated as empty.
        foundation_name: Name of the foundation being searched for.
        year: Target report year, or ``None``.

    Returns:
        The total score; higher means a better candidate.
    """
    title = (item.get("title") or "").lower()
    link = (item.get("link") or "").lower()
    # "rapport" also covers "rapport annuel"; both are kept for readability.
    report_keywords = ("annual", "report", "jahresbericht", "rapport", "rapport annuel")

    score = 0.0
    if any(keyword in title for keyword in report_keywords):
        score += 2
    name_prefix = foundation_name.lower()[:10]
    # An empty prefix is a substring of every string, which would hand the
    # name bonus to all candidates; only award it for a real prefix.
    if name_prefix and (name_prefix in title or name_prefix in link):
        score += 1.5
    if year:
        year_text = str(year)
        if year_text in title or year_text in link:
            score += 1.5
    if _is_pdf_link(link):
        score += 1.0
    return score
def find_best_report_url(foundation_name: str, year: Optional[int], extra_terms: Optional[str], serpapi_key: Optional[str]) -> Optional[Dict]:
    """Search for a foundation's annual report and return the best hit.

    Builds a query from the foundation name, the optional year and extra
    terms, and a fixed PDF/site filter; runs it through ``serpapi_search``;
    and ranks the results with ``score_candidate``.

    Returns:
        The highest-scoring result mapping (the earliest one on ties), or
        ``None`` when the search returns nothing.
    """
    query_parts = [f'{foundation_name} annual report']
    if year:
        query_parts.append(f'{year}')
    if extra_terms:
        query_parts.append(f'{extra_terms}')
    query_parts.append('filetype:pdf site:org | site:ch | site:foundation | site:stiftung | site:fondation')
    query = ' '.join(query_parts)

    candidates = serpapi_search(query, num_results=20, serpapi_key=serpapi_key)
    if not candidates:
        return None

    def _score(candidate: Dict) -> float:
        return score_candidate(candidate, foundation_name, year)

    best_first = sorted(candidates, key=_score, reverse=True)
    return best_first[0]