Files changed (1)
  1. app.py +322 -101
app.py CHANGED
@@ -1,117 +1,338 @@
-import uvicorn
-from fastapi import FastAPI, Request
-from pydantic import BaseModel
-import requests
-from bs4 import BeautifulSoup
-from transformers import pipeline
-import re
-
-# =========================
-# CONFIG
-# =========================
-NEWS_API_KEY = "9067f24c056541fd937a455293d9ace3"  # newsapi.org
-GNEWS_API_KEY = "c41717a7b25455cd0937016c539e72d5"  # gnews.io
-FACT_KEY = "AIzaSyB0A-MIHs8qkjYTWE-TnoLw46KplX-Ihjs"  # Google Fact Check
-CSE_ID = "727386fd4ef37425d"  # Custom Search ID
-
-app = FastAPI()
-
-# =========================
-# MODELS (Summarization + Sentiment)
-# =========================
-summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-classifier = pipeline("text-classification", model="facebook/bart-large-mnli")
-
-# =========================
-# HELPERS
-# =========================
-def extract_text_from_url(url: str) -> str:
-    """Fetch article text from a URL."""
-    try:
-        html = requests.get(url, timeout=10).text
-        soup = BeautifulSoup(html, "html.parser")
-        paragraphs = [p.get_text() for p in soup.find_all("p")]
-        text = " ".join(paragraphs)
-        return re.sub(r"\s+", " ", text.strip())
-    except Exception as e:
-        return f"Error extracting text: {e}"
-
-def search_news_sources(query: str):
-    """Cross-check query with NewsAPI + GNews + Google CSE."""
-    results = []
-
-    # NewsAPI
-    try:
-        r = requests.get(f"https://newsapi.org/v2/everything?q={query}&apiKey={NEWS_API_KEY}")
-        data = r.json()
-        for a in data.get("articles", []):
-            results.append({"title": a["title"], "url": a["url"], "source": "NewsAPI"})
-    except:
-        pass
-
-    # GNews
-    try:
-        r = requests.get(f"https://gnews.io/api/v4/search?q={query}&token={GNEWS_API_KEY}")
-        data = r.json()
-        for a in data.get("articles", []):
-            results.append({"title": a["title"], "url": a["url"], "source": "GNews"})
-    except:
-        pass
-
-    # Google Custom Search
-    try:
-        r = requests.get(
-            f"https://www.googleapis.com/customsearch/v1?q={query}&key={FACT_KEY}&cx={CSE_ID}"
-        )
-        data = r.json()
-        for item in data.get("items", []):
-            results.append({"title": item["title"], "url": item["link"], "source": "Google CSE"})
-    except:
-        pass
-
-    return results
-
-# =========================
-# REQUEST SCHEMA
-# =========================
-class VerifyRequest(BaseModel):
-    input: str  # can be text or url
-
-# =========================
-# MAIN ENDPOINT
-# =========================
-@app.post("/verify")
-async def verify_news(req: VerifyRequest):
-    user_input = req.input.strip()
-
-    # Step 1: Extract text if URL
-    if user_input.startswith("http://") or user_input.startswith("https://"):
-        text = extract_text_from_url(user_input)
-    else:
-        text = user_input
-
-    if not text or text.startswith("Error"):
-        return {"error": "Could not extract content", "input": user_input}
-
-    # Step 2: Summarize content
-    summary = summarizer(text, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
-
-    # Step 3: Classify content (is it factual / opinion / fake-sounding?)
-    classification = classifier(summary)[0]
-
-    # Step 4: Search for supporting evidence
-    sources = search_news_sources(summary[:80])  # shorten query for better search
-
-    # Step 5: Construct response
-    return {
-        "input": user_input,
-        "summary": summary,
-        "classification": classification,
-        "sources": sources if sources else "No supporting sources found"
-    }
-
-# =========================
-# RUN SERVER
-# =========================
-if __name__ == "__main__":
-    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
+import os
+import json
+import asyncio
+import logging
+from datetime import datetime, timezone
+from typing import List, Optional, Dict, Any
+import google.generativeai as genai
+import httpx
+from fastapi import FastAPI, HTTPException, Header, Depends
+from pydantic import BaseModel, Field
+
+try:
+    from newspaper import Article
+except Exception:
+    Article = None
+
+try:
+    from googlesearch import search as google_search
+except Exception:
+    google_search = None
+
+
+# -------------------------
+# Logging setup
+# -------------------------
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+logger = logging.getLogger("app")
+
+
+# -------------------------
+# Config
+# -------------------------
+NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
+GNEWS_KEY = os.getenv("GNEWS_KEY")
+
+AI_PROVIDER = os.getenv("AI_PROVIDER", "none")  # "gemini" or "openai"
+AI_API_KEY = os.getenv("AI_API_KEY")
+
+API_KEY = os.getenv("API_KEY", "changeme")  # protects the /verify endpoint
+
+
+# -------------------------
+# Dependencies
+# -------------------------
+def verify_api_key(x_api_key: str = Header(...)):
+    if x_api_key != API_KEY:
+        raise HTTPException(status_code=401, detail="Invalid or missing API Key")
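+# Callers must send the key in an "X-API-Key" request header; FastAPI maps the
+# x_api_key parameter name to that header automatically.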
+
+
+# -------------------------
+# Helpers
+# -------------------------
+def parse_iso_date(value: str) -> Optional[str]:
+    if not value:
+        return None
+    try:
+        dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
+        return dt.astimezone().isoformat()
+    except Exception:
+        return None
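+# Example (the offset in the output depends on the server's local timezone):
+#   parse_iso_date("2024-05-01T10:00:00Z")  -> e.g. "2024-05-01T12:00:00+02:00"
+#   parse_iso_date("not-a-date")            -> None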
+
+
+async def fetch_json(client: httpx.AsyncClient, url: str, params: Dict[str, Any]) -> Dict[str, Any]:
+    try:
+        r = await client.get(url, params=params, timeout=15)
+        r.raise_for_status()
+        return r.json()
+    except Exception as e:
+        logger.error(f"Error fetching {url}: {e}")
+        return {}
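+# fetch_json never raises: any network or HTTP error is logged and {} is returned,
+# so callers can treat failures as "no results" via data.get(..., []).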
+
+
+# -------------------------
+# Models
+# -------------------------
+class VerifyIn(BaseModel):
+    text: str = Field(..., description="Claim text to verify")
+    lang: str = Field("en", description="Language (ISO 639-1)")
+
+
+class ArticleItem(BaseModel):
+    title: Optional[str] = None
+    url: Optional[str] = None
+    source: Optional[str] = None
+    publishedAt: Optional[str] = None
+    content: Optional[str] = None
+    confidence: float = 0.0
+
+
+class SocialHit(BaseModel):
+    title: Optional[str] = None
+    url: Optional[str] = None
+    source: Optional[str] = None
+
+
+class Classification(BaseModel):
+    category: str
+    keywords: List[str] = []
+
+
+class Verdict(BaseModel):
+    verdict: str
+    reason: str
+    confidence: float
+
+
+class VerifyOut(BaseModel):
+    classification: Classification
+    summary: str
+    verdict: Verdict
+    news: List[ArticleItem] = []
+    social: Dict[str, List[SocialHit]] = {}
+    timeline: List[ArticleItem] = []
+ # -------------------------
120
+ # Core Logic
121
+ # -------------------------
122
+ def lightweight_keywords(text: str) -> List[str]:
123
+ import re
124
+
125
+ words = re.findall(r"[A-Za-z]{4,}", text.lower())
126
+ stopwords = set("this that with from into about your they it's dont cant wont very more less most the for and not but or yet so on in at by to of as is are be".split())
127
+ return [w for w in words if w not in stopwords][:12]
128
+
129
+
130
+ async def search_newsapi(query: str, lang: str) -> List[ArticleItem]:
131
+ if not NEWSAPI_KEY:
132
+ return []
133
+ url = "https://newsapi.org/v2/everything"
134
+ params = {"q": query, "language": lang, "pageSize": 10, "sortBy": "relevancy", "apiKey": NEWSAPI_KEY}
135
+ async with httpx.AsyncClient() as client:
136
+ data = await fetch_json(client, url, params)
137
+ return [
138
+ ArticleItem(
139
+ title=a.get("title"),
140
+ url=a.get("url"),
141
+ source=(a.get("source") or {}).get("name"),
142
+ publishedAt=parse_iso_date(a.get("publishedAt")),
143
+ )
144
+ for a in data.get("articles", [])
145
+ ]
146
+
147
+
148
+ async def search_gnews(query: str, lang: str) -> List[ArticleItem]:
149
+ if not GNEWS_KEY:
150
+ return []
151
+ url = "https://gnews.io/api/v4/search"
152
+ params = {"q": query, "lang": lang, "token": GNEWS_KEY, "max": 10}
153
+ async with httpx.AsyncClient() as client:
154
+ data = await fetch_json(client, url, params)
155
+ return [
156
+ ArticleItem(
157
+ title=a.get("title"),
158
+ url=a.get("url"),
159
+ source=(a.get("source") or {}).get("name"),
160
+ publishedAt=parse_iso_date(a.get("publishedAt")),
161
  )
162
+ for a in data.get("articles", [])
163
+ ]
164
+
165
+
166
+ async def fetch_article_body(url: str) -> Optional[str]:
167
+ if not Article:
168
+ return None
169
+ try:
170
+ art = Article(url)
171
+ art.download()
172
+ art.parse()
173
+ return art.text
174
+ except Exception:
175
+ return None
176
+
177
 
178
+ async def score_article_content(text: Optional[str]) -> float:
179
+ if not text:
180
+ return 0.2
181
+ length = len(text)
182
+ if length > 3000:
183
+ return 1.0
184
+ if length > 800:
185
+ return 0.7
186
+ if length > 300:
187
+ return 0.5
188
+ return 0.3
189
+
190
+
+
+
+async def gather_social(query: str, limit: int = 5) -> Dict[str, List[SocialHit]]:
+    # Best-effort social lookup via Google "site:" queries; the googlesearch
+    # package scrapes results and can be rate-limited, hence the per-site try/except.
+    results = {"twitter": [], "reddit": [], "facebook": [], "google_news": []}
+    if not google_search:
+        return results
+    sites = {
+        "twitter": "site:twitter.com",
+        "reddit": "site:reddit.com",
+        "facebook": "site:facebook.com",
+        "google_news": "site:news.google.com",
+    }
+    for key, prefix in sites.items():
+        try:
+            urls = google_search(f"{prefix} {query}", num=limit, stop=limit)
+            results[key] = [SocialHit(url=u, source=key) for u in urls]
+        except Exception as e:
+            logger.warning(f"Social search failed for {key}: {e}")
+    return results
+
+
+if AI_PROVIDER == "gemini" and AI_API_KEY:
+    genai.configure(api_key=AI_API_KEY)
+    gemini_model = genai.GenerativeModel("gemini-2.5-flash")
+else:
+    gemini_model = None
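+# NOTE: only Gemini is wired up here; AI_PROVIDER="openai" (mentioned in the
+# config comment above) leaves gemini_model=None, so those runs use the
+# rule-based fallback in ai_evaluate.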
+
+
+async def ai_evaluate(user_text: str, context_articles: List[ArticleItem]) -> Verdict:
+    """
+    Use Gemini if available, else fall back to rule-based evaluation.
+    """
+    # --- if Gemini enabled ---
+    if gemini_model:
+        sources_text = "\n".join([f"- {a.title or ''} ({a.url})" for a in context_articles[:10]])
+        prompt = f"""
+You are a fact-checking assistant.
+Task: Analyze the following claim and evidence.
+Decide if the claim is True, False, Misleading, or Unverifiable.
+Explain reasoning clearly.
+
+Claim: {user_text}
+
+Evidence from news:
+{sources_text}
+
+Respond with JSON:
+{{
+  "verdict": "True/False/Misleading/Unverifiable",
+  "reason": "explanation here",
+  "confidence": 0.0 to 1.0
+}}
+"""
+        try:
+            response = gemini_model.generate_content(prompt)
+            # Assumes the model returns raw JSON; fenced or chatty output raises
+            # here and drops through to the rule-based fallback below.
+            data = json.loads(response.text)
+            return Verdict(
+                verdict=data.get("verdict", "Unclear"),
+                reason=data.get("reason", "No reasoning provided"),
+                confidence=float(data.get("confidence", 0.5)),
+            )
+        except Exception as e:
+            logger.error(f"Gemini evaluation failed: {e}")
+
+    # --- fallback (rule-based) ---
+    sources = len([a for a in context_articles if a.url])
+    if sources >= 3:
+        verdict, conf = "Likely true", 0.8
+    elif sources == 0:
+        verdict, conf = "Insufficient evidence", 0.4
+    else:
+        verdict, conf = "Unclear", 0.5
+
+    return Verdict(
+        verdict=verdict,
+        reason=f"Fallback evaluation with {sources} sources.",
+        confidence=conf,
+    )
+
+
+def make_timeline(items: List[ArticleItem]) -> List[ArticleItem]:
+    # publishedAt values are timezone-aware ISO strings, so the missing-date
+    # fallback must be aware as well; a naive datetime.min would make sorted()
+    # raise TypeError when the two are compared.
+    def keyfn(a: ArticleItem):
+        if a.publishedAt:
+            try:
+                return datetime.fromisoformat(a.publishedAt)
+            except Exception:
+                return datetime.min.replace(tzinfo=timezone.utc)
+        return datetime.min.replace(tzinfo=timezone.utc)
+    return sorted(items, key=keyfn)
+
+
+# -------------------------
+# FastAPI App
+# -------------------------
+app = FastAPI(title="OSINT Verification API", version="1.0.0")
+
+
+@app.get("/health")
+async def health():
+    return {"status": "ok"}
+
+
+@app.post("/verify", response_model=VerifyOut, dependencies=[Depends(verify_api_key)])
+async def verify(payload: VerifyIn):
+    text = payload.text.strip()
+    if not text:
+        raise HTTPException(status_code=400, detail="Empty text")
+
+    # Step 1: Classification
+    kws = lightweight_keywords(text)
+    classification = Classification(category="claim", keywords=kws)
+
+    # Step 2: Gather evidence (both news searches and the social scrape run concurrently)
+    query = " ".join(kws) if kws else text[:200]
+    news1, news2, social = await asyncio.gather(
+        search_newsapi(query, payload.lang), search_gnews(query, payload.lang), gather_social(query)
+    )
+    articles = news1 + news2
+
+    # Step 3: Enrich articles (one task per article)
+    async def enrich(item: ArticleItem) -> ArticleItem:
+        body = await fetch_article_body(item.url) if item.url else None
+        item.content = body
+        item.confidence = await score_article_content(body)
+        return item
+
+    enriched = await asyncio.gather(*[enrich(a) for a in articles])
+
+    # Step 4: AI evaluation
+    summary = (text[:200] + "...") if len(text) > 200 else text
+    verdict = await ai_evaluate(text, enriched)
+
+    # Step 5: Timeline
+    timeline = make_timeline(enriched)
+
+    return VerifyOut(
+        classification=classification,
+        summary=summary,
+        verdict=verdict,
+        news=enriched,
+        social=social,
+        timeline=timeline,
+    )
+
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))
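+
+# Example local smoke test (hypothetical values; assumes API_KEY is left at its
+# "changeme" default and the server runs on the default port):
+#   import httpx
+#   r = httpx.post(
+#       "http://localhost:7860/verify",
+#       headers={"X-API-Key": "changeme"},
+#       json={"text": "NASA confirms water on the Moon", "lang": "en"},
+#   )
+#   print(r.json()["verdict"])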