# proofly/api_wrapper.py
# Author: Pragthedon
# Initial backend API deployment (commit 4f48a4e)
# ==========================================
# API WRAPPER FOR FLASK
# ==========================================
from project.database import get_total_evidence_count, load_all_evidence
def run_fact_check_api(claim):
    """Run the full fact-checking pipeline for *claim* and return structured data.

    API-friendly version that returns structured data instead of printing.

    Args:
        claim: Natural-language claim to verify.

    Returns:
        On success, a dict with keys: ``success`` (True), ``claim``,
        ``verdict`` ("True" / "False" / "Uncertain" / "Mixture/Uncertain"),
        ``confidence`` (float, rounded to 2 dp), ``evidence`` (list of
        {text, source, similarity}), ``nli_results`` (list of
        {evidence, label, score, similarity}) and ``total_evidence``.
        On failure, ``{"success": False, "error": ..., "evidence": [],
        "nli_results": []}``.

    Note:
        This is a simplified version for demo. For full functionality,
        install all dependencies from requirements.txt; when the heavy
        ML dependencies are missing, canned demo data is returned so the
        API endpoint stays functional.
    """
    try:
        # Heavy imports are deferred so the function can degrade gracefully
        # (see the ImportError handler below) when the ML stack is absent.
        from model import (
            init_db, clear_db, embed_model, fetch_rss, fetch_gdelt, fetch_newsapi,
            fetch_wikipedia, fetch_duckduckgo, fetch_knowledge_base, fetch_wikidata,
            build_faiss, load_all_evidence, nli_model, FAISS_FILE
        )
        import faiss
        import re

        # Full implementation
        init_db()
        # clear_db() intentionally not called: evidence accumulates across runs.
        claim_emb = embed_model.encode([claim], normalize_embeddings=True)

        # 1. Static knowledge base (offline, always runs first)
        kb_count = fetch_knowledge_base(claim, claim_emb)

        # ── Quick KB short-circuit ──────────────────────────────────────
        # If KB already found strong matches, build a temporary FAISS index
        # and check the best similarity score. If it's high (≥ 0.65) we have
        # enough reliable evidence — skip the slow live fetches entirely.
        kb_short_circuit = False
        if kb_count >= 1 and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            _D, _ = _idx.search(claim_emb, 1)
            if len(_D[0]) > 0 and _D[0][0] >= 0.65:
                kb_short_circuit = True
                print(f"[KnowledgeBase] Strong match (score={_D[0][0]:.2f}) — skipping live fetches.")
        # ───────────────────────────────────────────────────────────────

        # 2. Wikidata entity search (fast, no API key — always runs)
        fetch_wikidata(claim, claim_emb)

        # ── Database Evidence Search (Vector Cache) ───────────────────
        # Before doing slow live scraping, check if our database already has
        # highly relevant evidence from previous fact-checks of similar topics.
        local_evidence_found = False
        if not kb_short_circuit and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            if _idx.ntotal > 0:
                _D, _ = _idx.search(claim_emb, 1)
                if len(_D[0]) > 0 and _D[0][0] >= 0.60:
                    local_evidence_found = True
                    print(f"[VectorCache] Strong local evidence found (score={_D[0][0]:.2f}) — skipping live scrapes.")
        # ───────────────────────────────────────────────────────────────

        # 3. Live fetches — skipped when KB or local DB already has strong matches
        gdelt_count = 0
        newsapi_count = 0
        if not kb_short_circuit and not local_evidence_found:
            fetch_rss(claim_emb)
            gdelt_count = fetch_gdelt(claim, claim_emb)
            newsapi_count = fetch_newsapi(claim, claim_emb)
            fetch_wikipedia(claim)

        # Fallback web search when the news fetches produced little evidence
        # or the best indexed match is weak (< 0.50 cosine similarity).
        total_count = get_total_evidence_count()
        activate_fallback = (gdelt_count + newsapi_count) == 0 or total_count < 3
        if build_faiss():
            index = faiss.read_index(FAISS_FILE)
            D, _ = index.search(claim_emb, 1)
            if len(D) > 0 and len(D[0]) > 0 and D[0][0] < 0.50:
                activate_fallback = True
        if activate_fallback:
            fetch_duckduckgo(claim, claim_emb)

        if not build_faiss():
            # Nothing could be indexed at all — no evidence to reason over.
            return {
                "success": False,
                "error": "No relevant evidence found.",
                "evidence": [],
                "nli_results": []
            }
        index = faiss.read_index(FAISS_FILE)

        # Search wider first (10 items), then de-duplicate down to 5.
        top_k = min(10, index.ntotal)
        D, I = index.search(claim_emb, top_k)
        rows = load_all_evidence()  # each row: (id, text, source, embedding_json)

        # De-duplicate by text content and apply minimum similarity threshold.
        seen_texts = set()
        unique_indices = []
        unique_scores = []
        for sim_score, row_idx in zip(D[0], I[0]):
            # FAISS pads missing neighbors with -1; a bare `>= len(rows)` check
            # would let -1 silently index the LAST row, so guard negatives too.
            if row_idx < 0 or row_idx >= len(rows):
                continue
            txt = rows[row_idx][1][:100]  # key by first 100 chars
            if txt not in seen_texts and sim_score >= 0.50:
                seen_texts.add(txt)
                unique_indices.append(row_idx)
                unique_scores.append(sim_score)
            if len(unique_indices) >= 5:
                break

        evidence_list = [
            {
                "text": rows[idx][1],
                "source": rows[idx][2],
                "similarity": float(score)
            }
            for idx, score in zip(unique_indices, unique_scores)
        ]

        # Hoisted out of the NLI loop: originally this helper (and its
        # `import re`) was re-created for every evidence item.
        _qualifier_re = re.compile(
            r'\s+(in\s+\d{4}|since\s+\w+|currently|right now|nowadays|as of \w+)$',
            flags=re.IGNORECASE
        )

        def _get_core_claim(c):
            """Strip trailing prepositional qualifiers like 'in 2024', 'currently'
            that confuse literal NLI matching — but NOT location qualifiers that
            are part of the claim's meaning (e.g. 'at sea level')."""
            stripped = _qualifier_re.sub('', c.strip())
            return stripped if stripped != c else c

        # Build NLI results (track similarity for weighted voting below).
        nli_results = []
        for i, idx in enumerate(unique_indices):
            evidence_text = rows[idx][1]
            sim_weight = float(unique_scores[i])  # FAISS cosine similarity
            try:
                # Run NLI with the raw claim — this is always the primary result.
                r1 = nli_model(evidence_text, text_pair=claim)
                label1 = r1[0].get("label", "neutral")
                score1 = float(r1[0].get("score", 0.0))
                # Only try the simplified core-claim if the raw result is neutral
                # (prevents stripping from flipping a correct entailment to
                # contradiction).
                if label1 == "neutral":
                    core = _get_core_claim(claim)
                    if core != claim:
                        r2 = nli_model(evidence_text, text_pair=core)
                        label2 = r2[0].get("label", "neutral")
                        score2 = float(r2[0].get("score", 0.0))
                        if label2 != "neutral" and score2 > score1:
                            label1, score1 = label2, score2
                nli_results.append({
                    "evidence": evidence_text[:200],
                    "label": label1,
                    "score": score1,
                    "similarity": sim_weight
                })
            except Exception as e:
                # Best-effort: a single NLI failure should not kill the request.
                print(f"[WARNING] NLI error: {e}")

        # ── Similarity-Weighted Verdict ───────────────────────────────────────
        # Uses the strongest single piece of evidence to avoid high-quality
        # sources being outvoted by a higher quantity of lower-quality noise.
        verdict = "Uncertain"
        confidence = 0.0
        if nli_results:
            best_entail = max(
                [r['score'] * r['similarity'] for r in nli_results
                 if 'entail' in r['label'].lower()] + [0.0]
            )
            best_contra = max(
                [r['score'] * r['similarity'] for r in nli_results
                 if 'contradict' in r['label'].lower()] + [0.0]
            )
            print(f"[Verdict] best entail={best_entail:.3f} contra={best_contra:.3f}")
            if best_entail > best_contra and best_entail >= 0.20:
                verdict = "True"
                confidence = best_entail
            elif best_contra > best_entail and best_contra >= 0.20:
                verdict = "False"
                confidence = best_contra
            else:
                verdict = "Mixture/Uncertain"
                confidence = max(best_entail, best_contra)

        return {
            "success": True,
            "claim": claim,
            "verdict": verdict,
            "confidence": round(confidence, 2),
            "evidence": evidence_list,
            "nli_results": nli_results,
            "total_evidence": len(evidence_list)
        }
    except ImportError as e:
        print(f"DEBUG: ImportError in api_wrapper: {e}")
        # Dependencies missing: return canned demo data so the endpoint stays
        # up. "verdict"/"confidence" added for key-parity with the real path.
        return {
            "success": True,
            "claim": claim,
            "verdict": "Uncertain",
            "confidence": 0.0,
            "evidence": [
                {
                    "text": "This is demo evidence from RSS feed. Install dependencies from requirements.txt for real fact-checking.",
                    "source": "RSS",
                    "similarity": 0.85
                },
                {
                    "text": "This is demo evidence from GDELT. The full system searches multiple news sources and databases.",
                    "source": "GDELT",
                    "similarity": 0.78
                },
                {
                    "text": "This is demo evidence from Wikipedia. Install all dependencies to enable real-time fact verification.",
                    "source": "Wikipedia",
                    "similarity": 0.72
                }
            ],
            "nli_results": [
                {
                    "evidence": "Demo evidence showing entailment (supports the claim)",
                    "label": "entailment",
                    "score": 0.89
                },
                {
                    "evidence": "Demo evidence showing neutral stance",
                    "label": "neutral",
                    "score": 0.65
                },
                {
                    "evidence": "Demo evidence showing contradiction",
                    "label": "contradiction",
                    "score": 0.45
                }
            ],
            "total_evidence": 3
        }
    except Exception as e:
        print(f"DEBUG: General Exception in api_wrapper: {e}")
        import traceback
        traceback.print_exc()
        return {
            "success": False,
            "error": str(e),
            "evidence": [],
            "nli_results": []
        }