# proofly/api_wrapper.py
# Author: Pragthedon
# Initial backend API deployment (commit 4f48a4e)
# ==========================================
# API WRAPPER FOR FLASK
# ==========================================
from project.database import get_total_evidence_count, load_all_evidence
def run_fact_check_api(claim):
    """Run the full fact-checking pipeline for *claim* and return structured data.

    API-friendly version that returns structured data instead of printing.

    Args:
        claim: Natural-language claim to verify.

    Returns:
        On success, a dict with keys: ``success`` (True), ``claim``,
        ``verdict`` ("True" / "False" / "Uncertain" / "Mixture/Uncertain"),
        ``confidence`` (float, rounded to 2 dp), ``evidence`` (list of
        {text, source, similarity}), ``nli_results`` (list of
        {evidence, label, score, similarity}) and ``total_evidence``.
        On failure, ``{"success": False, "error": ..., "evidence": [],
        "nli_results": []}``.

    Note:
        This is a simplified version for demo. For full functionality,
        install all dependencies from requirements.txt; when the heavy
        ML dependencies are missing, canned demo data is returned so the
        API endpoint stays functional.
    """
    try:
        # Heavy imports are deferred so the function can degrade gracefully
        # (see the ImportError handler below) when the ML stack is absent.
        from model import (
            init_db, clear_db, embed_model, fetch_rss, fetch_gdelt, fetch_newsapi,
            fetch_wikipedia, fetch_duckduckgo, fetch_knowledge_base, fetch_wikidata,
            build_faiss, load_all_evidence, nli_model, FAISS_FILE
        )
        import faiss
        import re

        # Full implementation
        init_db()
        # clear_db() intentionally not called: evidence accumulates across runs.
        claim_emb = embed_model.encode([claim], normalize_embeddings=True)

        # 1. Static knowledge base (offline, always runs first)
        kb_count = fetch_knowledge_base(claim, claim_emb)

        # ── Quick KB short-circuit ──────────────────────────────────────
        # If KB already found strong matches, build a temporary FAISS index
        # and check the best similarity score. If it's high (≥ 0.65) we have
        # enough reliable evidence — skip the slow live fetches entirely.
        kb_short_circuit = False
        if kb_count >= 1 and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            _D, _ = _idx.search(claim_emb, 1)
            if len(_D[0]) > 0 and _D[0][0] >= 0.65:
                kb_short_circuit = True
                print(f"[KnowledgeBase] Strong match (score={_D[0][0]:.2f}) — skipping live fetches.")
        # ───────────────────────────────────────────────────────────────

        # 2. Wikidata entity search (fast, no API key — always runs)
        fetch_wikidata(claim, claim_emb)

        # ── Database Evidence Search (Vector Cache) ───────────────────
        # Before doing slow live scraping, check if our database already has
        # highly relevant evidence from previous fact-checks of similar topics.
        local_evidence_found = False
        if not kb_short_circuit and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            if _idx.ntotal > 0:
                _D, _ = _idx.search(claim_emb, 1)
                if len(_D[0]) > 0 and _D[0][0] >= 0.60:
                    local_evidence_found = True
                    print(f"[VectorCache] Strong local evidence found (score={_D[0][0]:.2f}) — skipping live scrapes.")
        # ───────────────────────────────────────────────────────────────

        # 3. Live fetches — skipped when KB or local DB already has strong matches
        gdelt_count = 0
        newsapi_count = 0
        if not kb_short_circuit and not local_evidence_found:
            fetch_rss(claim_emb)
            gdelt_count = fetch_gdelt(claim, claim_emb)
            newsapi_count = fetch_newsapi(claim, claim_emb)
            fetch_wikipedia(claim)

        # Fallback web search when the news fetches produced little evidence
        # or the best indexed match is weak (< 0.50 cosine similarity).
        total_count = get_total_evidence_count()
        activate_fallback = (gdelt_count + newsapi_count) == 0 or total_count < 3
        if build_faiss():
            index = faiss.read_index(FAISS_FILE)
            D, _ = index.search(claim_emb, 1)
            if len(D) > 0 and len(D[0]) > 0 and D[0][0] < 0.50:
                activate_fallback = True
        if activate_fallback:
            fetch_duckduckgo(claim, claim_emb)

        if not build_faiss():
            # Nothing could be indexed at all — no evidence to reason over.
            return {
                "success": False,
                "error": "No relevant evidence found.",
                "evidence": [],
                "nli_results": []
            }
        index = faiss.read_index(FAISS_FILE)

        # Search wider first (10 items), then de-duplicate down to 5.
        top_k = min(10, index.ntotal)
        D, I = index.search(claim_emb, top_k)
        rows = load_all_evidence()  # each row: (id, text, source, embedding_json)

        # De-duplicate by text content and apply minimum similarity threshold.
        seen_texts = set()
        unique_indices = []
        unique_scores = []
        for sim_score, row_idx in zip(D[0], I[0]):
            # FAISS pads missing neighbors with -1; a bare `>= len(rows)` check
            # would let -1 silently index the LAST row, so guard negatives too.
            if row_idx < 0 or row_idx >= len(rows):
                continue
            txt = rows[row_idx][1][:100]  # key by first 100 chars
            if txt not in seen_texts and sim_score >= 0.50:
                seen_texts.add(txt)
                unique_indices.append(row_idx)
                unique_scores.append(sim_score)
            if len(unique_indices) >= 5:
                break

        evidence_list = [
            {
                "text": rows[idx][1],
                "source": rows[idx][2],
                "similarity": float(score)
            }
            for idx, score in zip(unique_indices, unique_scores)
        ]

        # Hoisted out of the NLI loop: originally this helper (and its
        # `import re`) was re-created for every evidence item.
        _qualifier_re = re.compile(
            r'\s+(in\s+\d{4}|since\s+\w+|currently|right now|nowadays|as of \w+)$',
            flags=re.IGNORECASE
        )

        def _get_core_claim(c):
            """Strip trailing prepositional qualifiers like 'in 2024', 'currently'
            that confuse literal NLI matching — but NOT location qualifiers that
            are part of the claim's meaning (e.g. 'at sea level')."""
            stripped = _qualifier_re.sub('', c.strip())
            return stripped if stripped != c else c

        # Build NLI results (track similarity for weighted voting below).
        nli_results = []
        for i, idx in enumerate(unique_indices):
            evidence_text = rows[idx][1]
            sim_weight = float(unique_scores[i])  # FAISS cosine similarity
            try:
                # Run NLI with the raw claim — this is always the primary result.
                r1 = nli_model(evidence_text, text_pair=claim)
                label1 = r1[0].get("label", "neutral")
                score1 = float(r1[0].get("score", 0.0))
                # Only try the simplified core-claim if the raw result is neutral
                # (prevents stripping from flipping a correct entailment to
                # contradiction).
                if label1 == "neutral":
                    core = _get_core_claim(claim)
                    if core != claim:
                        r2 = nli_model(evidence_text, text_pair=core)
                        label2 = r2[0].get("label", "neutral")
                        score2 = float(r2[0].get("score", 0.0))
                        if label2 != "neutral" and score2 > score1:
                            label1, score1 = label2, score2
                nli_results.append({
                    "evidence": evidence_text[:200],
                    "label": label1,
                    "score": score1,
                    "similarity": sim_weight
                })
            except Exception as e:
                # Best-effort: a single NLI failure should not kill the request.
                print(f"[WARNING] NLI error: {e}")

        # ── Similarity-Weighted Verdict ───────────────────────────────────────
        # Uses the strongest single piece of evidence to avoid high-quality
        # sources being outvoted by a higher quantity of lower-quality noise.
        verdict = "Uncertain"
        confidence = 0.0
        if nli_results:
            best_entail = max(
                [r['score'] * r['similarity'] for r in nli_results
                 if 'entail' in r['label'].lower()] + [0.0]
            )
            best_contra = max(
                [r['score'] * r['similarity'] for r in nli_results
                 if 'contradict' in r['label'].lower()] + [0.0]
            )
            print(f"[Verdict] best entail={best_entail:.3f} contra={best_contra:.3f}")
            if best_entail > best_contra and best_entail >= 0.20:
                verdict = "True"
                confidence = best_entail
            elif best_contra > best_entail and best_contra >= 0.20:
                verdict = "False"
                confidence = best_contra
            else:
                verdict = "Mixture/Uncertain"
                confidence = max(best_entail, best_contra)

        return {
            "success": True,
            "claim": claim,
            "verdict": verdict,
            "confidence": round(confidence, 2),
            "evidence": evidence_list,
            "nli_results": nli_results,
            "total_evidence": len(evidence_list)
        }
    except ImportError as e:
        print(f"DEBUG: ImportError in api_wrapper: {e}")
        # Dependencies missing: return canned demo data so the endpoint stays
        # up. "verdict"/"confidence" added for key-parity with the real path.
        return {
            "success": True,
            "claim": claim,
            "verdict": "Uncertain",
            "confidence": 0.0,
            "evidence": [
                {
                    "text": "This is demo evidence from RSS feed. Install dependencies from requirements.txt for real fact-checking.",
                    "source": "RSS",
                    "similarity": 0.85
                },
                {
                    "text": "This is demo evidence from GDELT. The full system searches multiple news sources and databases.",
                    "source": "GDELT",
                    "similarity": 0.78
                },
                {
                    "text": "This is demo evidence from Wikipedia. Install all dependencies to enable real-time fact verification.",
                    "source": "Wikipedia",
                    "similarity": 0.72
                }
            ],
            "nli_results": [
                {
                    "evidence": "Demo evidence showing entailment (supports the claim)",
                    "label": "entailment",
                    "score": 0.89
                },
                {
                    "evidence": "Demo evidence showing neutral stance",
                    "label": "neutral",
                    "score": 0.65
                },
                {
                    "evidence": "Demo evidence showing contradiction",
                    "label": "contradiction",
                    "score": 0.45
                }
            ],
            "total_evidence": 3
        }
    except Exception as e:
        print(f"DEBUG: General Exception in api_wrapper: {e}")
        import traceback
        traceback.print_exc()
        return {
            "success": False,
            "error": str(e),
            "evidence": [],
            "nli_results": []
        }