aubynsamuel05 committed on
Commit
f078461
·
1 Parent(s): b372940

Fake news detector with Gradio interface

.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.vscode
+.venv
+venv
+**/__pycache__/
+snli_1.0_dev.jsonl
app.py CHANGED
@@ -1,67 +1,20 @@
 import os
+
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 
-from typing import List
-import torch
-from sentence_transformers import SentenceTransformer, util
-from textblob import TextBlob
 import gradio as gr
+from deploy.index import FakeNewsDetector
 
-model = SentenceTransformer("paraphrase-MiniLM-L12-v2")
-model.eval()
-
-
-def calculate_semantic_similarity(
-    claim: str, sentences_input: str, similarity_threshold: float = 0.4
-) -> float:
-    """
-    Accepts a claim and newline-separated sentences. Returns a weighted similarity score.
-    """
-    sentences = [s.strip() for s in sentences_input.split("\n") if s.strip()]
-    if not sentences:
-        return 0.0
-
-    all_scores = []
-
-    with torch.no_grad():
-        claim_embedding = model.encode(claim, show_progress_bar=False)
-        sentence_embeddings = model.encode(sentences, show_progress_bar=False)
-        cosine_scores = util.cos_sim(claim_embedding, sentence_embeddings)[0]
-    claim_sentiment = TextBlob(claim).sentiment.polarity
-
-    for i, sentence in enumerate(sentences):
-        similarity = cosine_scores[i].item()
-        sentence_sentiment = TextBlob(sentence).sentiment.polarity
-
-        if claim_sentiment * sentence_sentiment > 0:
-            similarity *= 1.1
-        elif claim_sentiment * sentence_sentiment < 0:
-            similarity *= 0.9
-
-        similarity = max(0.0, min(1.0, similarity))
-        all_scores.append(similarity)
-
-    supporting_scores = [s for s in all_scores if s >= similarity_threshold]
-    proportion_supporting = len(supporting_scores) / len(sentences)
-
-    if proportion_supporting >= 0.30:
-        final_score = sum(supporting_scores) / len(supporting_scores)
-    else:
-        final_score = sum(all_scores) / len(all_scores)
-
-    return round(final_score, 4)
-
+detector = FakeNewsDetector()
 
 iface = gr.Interface(
-    fn=calculate_semantic_similarity,
+    fn=detector.comprehensive_verify,
     inputs=[
-        gr.Textbox(label="Claim"),
-        gr.Textbox(lines=10, label="Evidence Sentences (one per line)"),
-        gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.4, label="Similarity Threshold")
+        gr.Textbox(label="Headline"),
     ],
-    outputs=gr.Number(label="Final Weighted Support Score"),
-    title="Claim Support Checker",
-    description="Input a claim and evidence sentences to calculate how strongly the evidence supports the claim."
+    outputs=gr.JSON(label="Analysis Result"),  # JSON output for structured verdict
+    title="Fake News Detector",
+    description="Input a headline to check how credible it is.",
 )
 
 iface.launch()
deploy/__init__.py ADDED
File without changes
deploy/index.py ADDED
@@ -0,0 +1,361 @@
+import os
+
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+import gc
+import time
+import random
+from datetime import datetime
+from typing import Dict, List, Tuple, Any
+import numpy as np
+from googlesearch import search
+
+from deploy.main.claim_verifier import ClaimVerifier
+from deploy.main.network_analyzer import NetworkAnalyzer
+from deploy.main.source_credibility_analyzer import SourceCredibilityAnalyzer
+from deploy.utils.general_utils import extract_domain
+from deploy.main.predict_clickbait import ClickbaitPredictor
+
+import nltk
+
+try:
+    nltk.data.find("tokenizers/punkt")
+    nltk.data.find("tokenizers/punkt_tab")
+except LookupError:
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+
+
+class FakeNewsDetector:
+    """Main enhanced fact checker with ML integration"""
+
+    def __init__(self):
+        try:
+            self.source_analyzer = SourceCredibilityAnalyzer()
+            self.claim_verifier = ClaimVerifier()
+            self.network_analyzer = NetworkAnalyzer()
+            self.clickbait_predictor = ClickbaitPredictor()
+        except Exception as e:
+            print(f"❌ Error initializing components: {e}")
+            raise
+
+    def _to_float(self, value: Any, default: float = 0.0) -> float:
+        """Safely convert any numeric value to Python float"""
+        try:
+            if isinstance(value, (np.integer, np.floating)):
+                return float(value)
+            elif isinstance(value, (int, float)):
+                return float(value)
+            else:
+                return default
+        except (ValueError, TypeError):
+            return default
+
+    def _analyze_clickbait(self, headline: str) -> float:
+        """Analyzes the headline for clickbait characteristics."""
+        print("🧠 ML Clickbait Analysis...")
+        try:
+            _, clickbait_score, _ = self.clickbait_predictor.predict(headline)
+            clickbait_score = self._to_float(clickbait_score, 0.5)
+            print(f"   Clickbait Score: {clickbait_score:.2f}")
+            return clickbait_score
+        except Exception as e:
+            print(f"   ❌ Clickbait analysis error: {e}")
+            return 0.5  # Default moderate score
+
+    def _search_for_sources(self, headline: str, num_results: int) -> List[str]:
+        """Searches the web for sources related to the headline."""
+        # print("🔎 Searching and analyzing sources...")
+        try:
+            time.sleep(random.uniform(1.5, 3.0))
+            search_results = list(search(headline, num_results=num_results, lang="en"))
+            # print(f"   Found {len(search_results)} search results")
+            return search_results
+        except Exception as e:
+            print(f"   ❌ Search error: {e}")
+            return []
+
+    def _analyze_source_credibility(
+        self, search_results: List[str]
+    ) -> Tuple[float, int, int]:
+        """Analyzes the credibility of the found source domains."""
+        print("📊 Analyzing source credibility...")
+
+        if not search_results:
+            print("   ❌ No search results to analyze")
+            return 0.1, 0, 0
+
+        source_scores = []
+        trusted_count = 0
+        suspicious_count = 0
+
+        for i, url in enumerate(search_results):
+            try:
+                domain = extract_domain(url)
+                credibility_score = self.source_analyzer.analyze_domain_credibility(
+                    domain
+                )
+                credibility_score = self._to_float(credibility_score, 0.5)
+                source_scores.append(credibility_score)
+
+                if credibility_score > 0.7:
+                    trusted_count += 1
+                    print(f"   {i+1}. {domain} ✅ ({credibility_score:.2f})")
+                elif credibility_score < 0.3:
+                    suspicious_count += 1
+                    print(f"   {i+1}. {domain} ❌ ({credibility_score:.2f})")
+                else:
+                    print(f"   {i+1}. {domain} ❓ ({credibility_score:.2f})")
+            except Exception as e:
+                print(f"   ❌ Error analyzing {url}: {e}")
+                source_scores.append(0.3)  # Default neutral score
+
+        # Use regular Python mean instead of np.mean
+        avg_credibility = (
+            sum(source_scores) / len(source_scores) if source_scores else 0.1
+        )
+        return avg_credibility, trusted_count, suspicious_count
+
+    def _analyze_network_propagation(
+        self, search_results: List[str]
+    ) -> Dict[str, float]:
+        """Analyzes the propagation pattern of the news across the network."""
+        print("🌐 Network Propagation Analysis...")
+
+        if not search_results:
+            print("   ❌ No search results for network analysis")
+            return {"score": 0.1, "domain_diversity": 0.0}
+
+        try:
+            network_analysis = self.network_analyzer.analyze_propagation_pattern(
+                search_results
+            )
+
+            # Convert all values to Python floats
+            result = {
+                "score": self._to_float(network_analysis.get("score", 0.1)),
+                "domain_diversity": self._to_float(
+                    network_analysis.get("domain_diversity", 0.0)
+                ),
+            }
+
+            print(f"   Propagation Score: {result['score']:.2f}")
+            print(f"   Domain Diversity: {result['domain_diversity']:.2f}")
+            return result
+        except Exception as e:
+            print(f"   ❌ Network analysis error: {e}")
+            return {"score": 0.1, "domain_diversity": 0.0}
+
+    def _verify_claim(self, headline: str, search_results: List[str]) -> float:
+        """Verifies the claim against the content of the found sources."""
+        print("✅ Verifying Claims...")
+
+        if not search_results:
+            print("   ❌ No search results for claim verification")
+            return 0.4
+
+        try:
+            verification = self.claim_verifier.verify_claim_against_sources(
+                headline, search_results
+            )
+            claim_verification_score = self._to_float(verification.get("score", 0.4))
+            print(f"   '{headline}': {claim_verification_score:.2f}")
+            return claim_verification_score
+        except Exception as e:
+            print(f"   ❌ Claim verification error: {e}")
+            return 0.4
+
+    def _calculate_final_score_and_verdict(
+        self, component_scores: Dict[str, float]
+    ) -> Tuple[float, str, str]:
+        """Calculates the final weighted score and determines the verdict."""
+        weights = {
+            "source_credibility": 0.35,
+            "claim_verification": 0.35,
+            "network_propagation": 0.20,
+            "clickbait_detection": 0.10,
+        }
+
+        final_score = sum(
+            component_scores.get(component, 0.0) * weight
+            for component, weight in weights.items()
+        )
+
+        if final_score >= 0.75:
+            verdict = "Credible — Backed by Evidence"
+            confidence = "Very High"
+        elif final_score >= 0.60:
+            verdict = "Likely True — Supported by Sources"
+            confidence = "High"
+        elif final_score >= 0.45:
+            verdict = "Unclear — Conflicting Information"
+            confidence = "Moderate"
+        elif final_score >= 0.30:
+            verdict = "Doubtful — Weak or Biased Evidence"
+            confidence = "Low"
+        else:
+            verdict = "False or Misleading — No Basis Found"
+            confidence = "Very Low"
+
+        return final_score, verdict, confidence
+
+    def _print_summary(self, results: Dict):
+        """Prints a formatted summary of the analysis results."""
+        final_verdict = results["final_verdict"]
+        components = results["components"]
+
+        print(f"📈 COMPREHENSIVE ANALYSIS RESULTS:")
+        print(
+            f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+        )
+        print(f"🎯 Final Score: {final_verdict['score']:.2f}/1.000")
+        print(f"🏆 Verdict: {final_verdict['verdict']}")
+        print(f"📊 Confidence: {final_verdict['confidence']}")
+
+        print(f"🔍 Component Breakdown:")
+        for component, score in final_verdict["components"].items():
+            print(f"   • {component.replace('_', ' ').title()}: {score:.2f}")
+
+        print(f"📋 Summary:")
+        print(
+            f"   • Trusted Sources: {components['source_credibility']['trusted_count']}"
+        )
+        print(
+            f"   • Suspicious Sources: {components['source_credibility']['suspicious_count']}"
+        )
+        print(
+            f"   • Clickbait Score: {components['clickbait']['score']:.2f} (lower is better)"
+        )
+        print(f"   • Domain Diversity: {components['network']['domain_diversity']:.2f}")
+
+    def comprehensive_verify(
+        self, raw_headline: str, results_to_check: int = 8
+    ) -> Dict:
+        """
+        Comprehensive fact-checking with ML integration.
+        This method orchestrates the analysis by calling various specialized components.
+        """
+        print(f'\n🔎 Comprehensive Analysis: "{raw_headline}"')
+        print("=" * 80)
+
+        if not raw_headline or not raw_headline.strip():
+            print("❌ Empty or invalid headline provided")
+            return {
+                "headline": "",
+                "timestamp": datetime.now().isoformat(),
+                "final_verdict": {
+                    "verdict": "❌ Invalid Input",
+                    "confidence": "Very High",
+                    "score": 0.0,
+                    "components": {
+                        "claim_verification": 0.0,
+                        "source_credibility": 0.0,
+                        "clickbait_detection": 0.0,
+                        "network_propagation": 0.0,
+                    },
+                },
+                "components": {
+                    "clickbait": {"score": 0.0},
+                    "source_credibility": {
+                        "score": 0.0,
+                        "trusted_count": 0,
+                        "suspicious_count": 0,
+                    },
+                    "network": {"score": 0.0, "domain_diversity": 0.0},
+                    "claim_verification": {"score": 0.0},
+                },
+            }
+
+        # Step 1: Search for sources
+        search_results = self._search_for_sources(raw_headline, results_to_check)
+
+        if not search_results:
+            print("⚠️ No search results found. Assigning low credibility by default.")
+            return {
+                "headline": raw_headline,
+                "timestamp": datetime.now().isoformat(),
+                "final_verdict": {
+                    "verdict": "🚫 HIGHLY QUESTIONABLE",
+                    "confidence": "Very High",
+                    "score": 0.1,
+                    "components": {
+                        "claim_verification": 0.1,
+                        "source_credibility": 0.1,
+                        "clickbait_detection": 0.1,
+                        "network_propagation": 0.1,
+                    },
+                },
+                "components": {
+                    "clickbait": {"score": 0.5},
+                    "source_credibility": {
+                        "score": 0.1,
+                        "trusted_count": 0,
+                        "suspicious_count": 0,
+                    },
+                    "network": {"score": 0.1, "domain_diversity": 0.0},
+                    "claim_verification": {"score": 0.1},
+                },
+            }
+
+        # Step 2: Run all analysis components
+        clickbait_score = self._analyze_clickbait(raw_headline)
+        avg_source_credibility, trusted_count, suspicious_count = (
+            self._analyze_source_credibility(search_results)
+        )
+        network_analysis = self._analyze_network_propagation(search_results)
+        claim_verification_score = self._verify_claim(raw_headline, search_results)
+
+        # Step 3: Consolidate component scores (ensure all are Python floats)
+        component_scores = {
+            "claim_verification": claim_verification_score,
+            "source_credibility": avg_source_credibility,
+            "clickbait_detection": 1.0 - clickbait_score,  # Invert score
+            "network_propagation": network_analysis["score"],
+        }
+
+        # Step 4: Calculate final score and verdict
+        final_score, verdict, confidence = self._calculate_final_score_and_verdict(
+            component_scores
+        )
+
+        # Step 5: Build the final JSON structure returned to the interface
+        analysis_results = {
+            "headline": raw_headline,
+            "timestamp": datetime.now().isoformat(),
+            "final_verdict": {
+                "verdict": verdict,
+                "confidence": confidence,
+                "score": round(final_score, 2),
+                "components": {
+                    "claim_verification": round(
+                        component_scores["claim_verification"], 2
+                    ),
+                    "source_credibility": round(
+                        component_scores["source_credibility"], 2
+                    ),
+                    "clickbait_detection": round(
+                        component_scores["clickbait_detection"], 2
+                    ),
+                    "network_propagation": round(
+                        component_scores["network_propagation"], 2
+                    ),
+                },
+            },
+            "components": {
+                "clickbait": {"score": round(clickbait_score, 2)},
+                "source_credibility": {
+                    "score": round(avg_source_credibility, 2),
+                    "trusted_count": trusted_count,
+                    "suspicious_count": suspicious_count,
+                },
+                "network": {
+                    "score": round(network_analysis["score"], 2),
+                    "domain_diversity": round(network_analysis["domain_diversity"], 2),
+                },
+                "claim_verification": {"score": round(claim_verification_score, 2)},
+            },
+        }
+
+        # self._print_summary(analysis_results)
+        gc.collect()
+        return analysis_results
deploy/main/__init__.py ADDED
File without changes
deploy/main/claim_verifier.py ADDED
@@ -0,0 +1,371 @@
+from typing import List, Dict, Optional, Tuple
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import hashlib
+from urllib.parse import urlparse
+import warnings
+import re
+from nltk.tokenize import sent_tokenize
+import string
+
+from deploy.utils.general_utils import TRUSTED_DOMAINS, SUSPICIOUS_DOMAINS
+from deploy.utils.content_extractor import extract_content
+from deploy.utils.url_filter import _is_corrupted_pdf_content, _is_pdf_or_download_url
+from semantic_similarity import calculate_semantic_similarity
+
+warnings.filterwarnings("ignore")
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+
+class ClaimVerifier:
+    """Enhanced claim verifier with smart sentence extraction and prioritized scraping."""
+
+    def __init__(self, cache_size: int = 500, max_workers: int = 4):
+        self.claim_cache: Dict[str, Dict] = {}
+        self.content_cache: Dict[str, str] = {}
+        self.cache_size = cache_size
+        self.max_workers = max_workers
+        self.trusted_domains = TRUSTED_DOMAINS
+        self.suspicious_domains = SUSPICIOUS_DOMAINS
+        self.domain_weights = {"trusted": 2.0, "suspicious": 0.3, "neutral": 1.0}
+        self.user_agents = [
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
+            "Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.196 Mobile Safari/537.36",
+            "Mozilla/5.0 (iPad; CPU OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/18.18363",
+            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
+        ]
+        self.current_ua_index = 0
+        self.timeout = 10
+
+    def _get_domain_weight(self, url: str) -> Tuple[float, str]:
+        domain = urlparse(url).netloc.lower().replace("www.", "")
+        if domain in self.trusted_domains:
+            return self.domain_weights["trusted"], "trusted"
+        elif domain in self.suspicious_domains:
+            return self.domain_weights["suspicious"], "suspicious"
+        else:
+            return self.domain_weights["neutral"], "neutral"
+
+    def _prioritize_sources(self, search_results: List[str]) -> List[str]:
+        """Prioritize trusted sources and filter out PDFs/downloads."""
+        # First, filter out PDFs and download links
+        filtered_results = []
+        pdf_count = 0
+
+        for url in search_results:
+            if _is_pdf_or_download_url(url):
+                pdf_count += 1
+                logging.info(f"📄 Filtered out PDF/download URL: {url}")
+                continue
+            filtered_results.append(url)
+
+        if pdf_count > 0:
+            logging.info(f"🚫 Filtered out {pdf_count} PDF/download URLs")
+
+        if not filtered_results:
+            logging.warning("⚠️ No valid URLs remaining after filtering PDFs/downloads")
+            return []
+
+        # Then prioritize trusted sources
+        trusted_sources = [
+            url
+            for url in filtered_results
+            if self._get_domain_weight(url)[1] == "trusted"
+        ]
+        other_sources = [
+            url
+            for url in filtered_results
+            if self._get_domain_weight(url)[1] != "trusted"
+        ]
+
+        if len(trusted_sources) >= 4:
+            return trusted_sources[:8]
+        else:
+            return (trusted_sources + other_sources)[:8]
+
+    def _is_valid_sentence(self, sentence: str) -> bool:
+        """Enhanced sentence validation to filter out garbled/corrupted text."""
+        sentence = sentence.strip()
+
+        # Basic length check
+        if len(sentence) < 20 or len(sentence) > 300:
+            return False
+
+        # Check for too many non-ASCII characters (garbled text indicator)
+        non_ascii_count = sum(1 for c in sentence if ord(c) > 127)
+        if non_ascii_count > len(sentence) * 0.3:  # More than 30% non-ASCII
+            return False
+
+        # Check for excessive special characters or symbols
+        special_chars = sum(
+            1 for c in sentence if c in string.punctuation and c not in ".,!?;:"
+        )
+        if special_chars > len(sentence) * 0.2:  # More than 20% special chars
+            return False
+
+        # Enhanced check for random character patterns (PDF corruption indicators)
+        if re.search(r"[^\w\s]{3,}", sentence):  # 3+ consecutive non-word chars
+            return False
+
+        # Check for PDF-specific corruption patterns
+        if re.search(r"(endstream|endobj|obj\s*<|stream\s+H)", sentence, re.IGNORECASE):
+            return False
+
+        # Check for excessive whitespace or control characters
+        if re.search(r"\s{3,}", sentence) or any(
+            ord(c) < 32 and c not in "\t\n\r" for c in sentence
+        ):
+            return False
+
+        # Check for minimum word count and average word length
+        words = sentence.split()
+        if len(words) < 4:
+            return False
+
+        # Check for reasonable word lengths (avoid strings like "a b c d e f g")
+        avg_word_length = sum(len(word) for word in words) / len(words)
+        if avg_word_length < 2.5:
+            return False
+
+        # Check for excessive capitalization
+        if sum(1 for c in sentence if c.isupper()) > len(sentence) * 0.5:
+            return False
+
+        # Check for sequences that look like corrupted encoding
+        if re.search(r"[^\w\s]{5,}", sentence):
+            return False
+
+        return True
+
+    def _is_noise_sentence(self, sentence: str) -> bool:
+        """Check if a sentence is likely noise (navigation, ads, etc.)."""
+        noise_patterns = [
+            r"^(click|tap|read|view|see|watch|follow|subscribe)",
+            r"(cookie|privacy|terms|conditions|policy)",
+            r"(advertisement|sponsored|ad)",
+            r"(©|copyright|\u00a9)",
+            r"^(home|about|contact|menu|search)",
+            r"(javascript|enable|browser|update)",
+            r"^[\W\d\s]*$",
+            r"(share|like|comment|subscribe)",
+            r"(login|sign\s+in|register)",
+            r"(loading|please\s+wait)",
+            # Add PDF-specific noise patterns
+            r"(pdf|download|file|document)\s*(viewer|reader)",
+            r"(page|pages)\s*\d+\s*(of|\/)\s*\d+",
+            r"(adobe|acrobat|reader)",
+        ]
+        sentence_lower = sentence.lower()
+        return any(re.search(pattern, sentence_lower) for pattern in noise_patterns)
+
+    def _extract_relevant_sentences(self, content: str) -> List[str]:
+        """Extract candidate sentences after filtering noise and corrupted text."""
+        if not content or len(content.strip()) < 50:
+            return []
+
+        # Check if content appears to be corrupted PDF
+        if _is_corrupted_pdf_content(content):
+            logging.warning("🚫 Content appears to be corrupted PDF - skipping")
+            return []
+
+        sentences = sent_tokenize(content)
+
+        # Enhanced filtering pipeline
+        valid_sentences = []
+        for sentence in sentences:
+            if self._is_valid_sentence(sentence) and not self._is_noise_sentence(
+                sentence
+            ):
+                valid_sentences.append(sentence.strip())
+
+        if not valid_sentences:
+            logging.warning("No valid sentences found after filtering")
+            return []
+
+        return valid_sentences
+
+    def _get_user_agent(self) -> str:
+        ua = self.user_agents[self.current_ua_index]
+        self.current_ua_index = (self.current_ua_index + 1) % len(self.user_agents)
+        return ua
+
+    def _cache_key(self, text: str) -> str:
+        return hashlib.md5(text.encode()).hexdigest()
+
+    def _add_to_cache(self, key: str, result: Dict):
+        if len(self.claim_cache) >= self.cache_size:
+            oldest_key = next(iter(self.claim_cache))
+            del self.claim_cache[oldest_key]
+        self.claim_cache[key] = result
+
+    def _get_from_cache(self, key: str) -> Optional[Dict]:
+        return self.claim_cache.get(key)
+
+    def _semantic_similarity_with_sentences(
+        self, claim: str, sentences: List[str]
+    ) -> float:
+        """Calculate the semantic similarity score between the claim and the sentences."""
+        try:
+            return calculate_semantic_similarity(claim, sentences)
+        except Exception as e:
+            logging.error(f"Error analyzing sentence: {e}")
+            return 0.0  # fall back to a neutral score instead of an undefined variable
+
+    def verify_claim_against_sources(
+        self, claim: str, search_results: List[str]
+    ) -> Dict:
+        logging.info(f"\nVerifying Claim: '{claim}'...")
+
+        cache_key = self._cache_key(f"verify_{claim}")
+        if cached_result := self._get_from_cache(cache_key):
+            logging.info("📋 Using cached result")
+            return cached_result
+
+        prioritized_sources = self._prioritize_sources(search_results)
+
+        if not prioritized_sources:
+            logging.warning("⚠️ No valid sources available after filtering")
+            return {
+                "score": 0.3,
+                "total_sources_processed": 0,
+                "support_sum": 0.0,
+                "total_weight": 0.0,
+                "source_details": [],
+                "warning": "No valid sources available after filtering PDFs/downloads",
+            }
+
+        support_scores = []
+        total_weight = 0.0
+        source_details = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            future_to_url = {
+                executor.submit(self._analyze_url, url, claim): url
+                for url in prioritized_sources
+            }
+
+            try:
+                for future in as_completed(future_to_url, timeout=45):
+                    url = future_to_url[future]
+                    try:
+                        if result := future.result(timeout=15):
+                            similarity_score, domain_weight, domain_type, sentences = (
+                                result
+                            )
+
+                            # Enhanced Logging Format
+                            logging.info(f"\nSource: {url} ({domain_type})")
+                            # logging.info(
+                            #     f"  - Relevant Sentences: {sentences[:3]}"
+                            # )  # Log first 3 sentences
+                            logging.info(
+                                f"  - Entailment Score: {similarity_score:.2f}"
+                            )
+
+                            total_weight += domain_weight
+                            if similarity_score >= 0.4:
+                                support_scores.append(similarity_score * domain_weight)
+
+                            source_details.append(
+                                {
+                                    "url": url,
+                                    "semantic_similarity": similarity_score,
+                                    "domain_weight": domain_weight,
+                                    "domain_type": domain_type,
+                                    "relevant_sentences": sentences[:3],
+                                }
+                            )
+                    except Exception as e:
+                        logging.error(f"Error processing {url}: {e}")
+            except TimeoutError:
+                logging.warning("⏰ Timeout: Some URLs were skipped.")
+
+        support_sum = sum(support_scores)
+
+        if total_weight > 0 and support_scores:  # guard against an empty support list
+            final_score = min(1.0, support_sum / len(support_scores))
+            # Adjustments
+            # if final_score < 0.5 and support_sum < 0.5:
+            #     final_score *= 0.8
+            # elif final_score > 0.5 and support_sum >= 1.0:
+            #     final_score = min(0.9, final_score * 1.1)
+        else:
+            final_score = 0.1
+
+        final_score = max(0.0, min(1.0, final_score))
+        logging.info(
+            f"\n{'='*20}\n🏁 Final Verification Score: {final_score:.2f}\n{'='*20}"
+        )
+
+        result = {
+            "score": final_score,
+            "total_sources_processed": len(source_details),
+            "support_sum": support_sum,
+            "total_weight": total_weight,
+            "source_details": source_details,
+        }
+        self._add_to_cache(cache_key, result)
+        return result
+
+    def _analyze_url(
+        self, url: str, claim: str
+    ) -> Optional[Tuple[float, float, str, List[str]]]:
+        try:
+            # Double-check for PDFs at analysis time (in case some slipped through)
+            if _is_pdf_or_download_url(url):
+                logging.info(f"🚫 Skipping PDF/download URL at analysis time: {url}")
+                return None
+
+            cache_key = self._cache_key(url)
+            content = extract_content(
+                url,
+                self.content_cache,
+                cache_key,
+                self._get_user_agent,
+                self.timeout,
+                self.cache_size,
+            )
+
+            if not content or len(content.strip()) < 50:
+                return None
+
+            # Check for corrupted PDF content
+            if _is_corrupted_pdf_content(content):
+                logging.warning(f"🚫 Skipping corrupted PDF content from: {url}")
+                return None
+
+            # Used for sentence extraction instead of embeddings
+            relevant_sentences = self._extract_relevant_sentences(content)
+
+            if not relevant_sentences:
+                return None
+
+            # cleaned_content = ""
+            # for sentence in relevant_sentences:
+            #     if (
+            #         sentence.endswith(".")
+            #         or sentence.endswith("?")
+            #         or sentence.endswith("!")
+            #     ):
+            #         cleaned_content += f"{sentence} "
+            #     else:
+            #         cleaned_content += f"{sentence}. "
+
+            semantic_similarity = self._semantic_similarity_with_sentences(
+                claim, relevant_sentences
+            )
+
+            domain_weight, domain_type = self._get_domain_weight(url)
+            # print(f"relevant_sentences: {cleaned_content}")
+
+            return semantic_similarity, domain_weight, domain_type, relevant_sentences
+        except Exception as e:
+            logging.error(f"Failed to analyze URL {url}: {e}")
+            return None
deploy/main/network_analyzer.py ADDED
@@ -0,0 +1,259 @@
+import math
+from collections import Counter
+from typing import List, Dict
+
+from deploy.utils.general_utils import (
+    TRUSTED_DOMAINS,
+    SUSPICIOUS_DOMAINS,
+    extract_domain,
+)
+
+SOCIAL_AGGREGATOR_DOMAINS = {
+    "facebook.com",
+    "twitter.com",
+    "reddit.com",
+    "youtube.com",
+    "instagram.com",
+    "tiktok.com",
+    "google.com",
+    "yahoo.com",
+    "msn.com",
+    "aol.com",
+    "linkedin.com",
+    "pinterest.com",
+    "snapchat.com",
+    "discord.com",
+    "telegram.org",
+}
+
+CONTENT_FARM_DOMAINS = {
+    "buzzfeed.com",
+    "clickhole.com",
+    "upworthy.com",
+    "viralthread.com",
+    "shareably.net",
+    "littlethings.com",
+    "providr.com",
+    "shared.com",
+}
+
+
+class NetworkAnalyzer:
+    """Propagation pattern analyzer - returns only score and domain_diversity"""
+
+    def __init__(self):
+        # Scoring weights
+        self.weights = {"domain_credibility": 0.60, "diversity_quality": 0.40}
+        self.min_sources_threshold = 3
+        self.min_unique_domains = 2
+
+    def _calculate_domain_credibility_score(self, domains: List[str]) -> float:
+        """Calculate domain credibility score"""
+        if not domains:
+            return 0.0
+
+        domain_counts = Counter(domains)
+        total_sources = len(domains)
+
+        # Categorize domains
+        trusted_count = sum(
+            count
+            for domain, count in domain_counts.items()
+            if domain in TRUSTED_DOMAINS
+        )
+        suspicious_count = sum(
+            count
+            for domain, count in domain_counts.items()
+            if domain in SUSPICIOUS_DOMAINS
+        )
+        social_count = sum(
+            count
+            for domain, count in domain_counts.items()
+            if domain in SOCIAL_AGGREGATOR_DOMAINS
+        )
+        content_farm_count = sum(
+            count
+            for domain, count in domain_counts.items()
+            if domain in CONTENT_FARM_DOMAINS
+        )
+
+        # Calculate ratios
+        trusted_ratio = trusted_count / total_sources
+        suspicious_ratio = suspicious_count / total_sources
+        social_ratio = social_count / total_sources
+        content_farm_ratio = content_farm_count / total_sources
+        unknown_ratio = 1 - (
+            trusted_ratio + suspicious_ratio + social_ratio + content_farm_ratio
+        )
+
+        # Calculate score
+        base_score = 0.15
+        score = base_score
+        score += trusted_ratio * 0.6
+        score -= suspicious_ratio * 0.8
+        score -= content_farm_ratio * 0.4
+        score += social_ratio * 0.1
+        score -= unknown_ratio * 0.2
+
+        # Additional penalties
+        if suspicious_ratio > 0.5:
+            score -= 0.3
+        if trusted_count == 0 and total_sources > 5:
+            score -= 0.2
+        if content_farm_ratio > 0.4:
+            score -= 0.15
+
+        return max(0.0, min(1.0, score))
+
+    def _calculate_diversity_quality(self, domains: List[str]) -> Dict:
+        """Calculate diversity quality - returns score and entropy.
+        Entropy here is a statistical measure of domain diversity,
+        helping to assess whether a claim's spread is broad and
+        organic or narrow and potentially suspicious.
+        """
+        if len(domains) < 2:
+            return {"score": 0.0, "entropy": 0.0}
+
+        domain_counts = Counter(domains)
+        unique_domains = len(set(domains))
+        total_sources = len(domains)
+
+        # Calculate Shannon entropy
+        entropy = 0.0
+        for count in domain_counts.values():
+            p = count / total_sources
+            if p > 0:
+                entropy -= p * math.log2(p)
+
+        # Normalize entropy
+        max_entropy = math.log2(unique_domains) if unique_domains > 1 else 0
+        normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0
+
+        # Base diversity score
+        diversity_score = normalized_entropy
+
+        # Detect artificial patterns
+        max_domain_share = max(domain_counts.values()) / total_sources
+
+        # Single domain dominance penalty
+        if max_domain_share > 0.7 and unique_domains > 3:
+            diversity_score -= 0.4
+
+        # Artificial diversity penalty
+        single_mention_domains = sum(
+            1 for count in domain_counts.values() if count == 1
+        )
+        if single_mention_domains > total_sources * 0.8 and total_sources > 10:
+            diversity_score -= 0.3
+
+        if 0.3 <= normalized_entropy <= 0.8 and unique_domains >= 3:
+            diversity_score += 0.2
+
+        return {
+            "score": max(0.0, min(1.0, diversity_score)),
+            "entropy": normalized_entropy,
+        }
+
+    def analyze_propagation_pattern(self, search_results: List[str]) -> Dict:
+        """Analyze propagation pattern - returns score and domain_diversity"""
+        domains = []
+        valid_urls = 0
+
+        for url in search_results:
+            domain = extract_domain(url)
+            if domain and domain not in ["", "localhost"]:
+                domains.append(domain)
+                valid_urls += 1
+
+        # Early return for insufficient data
+        if len(domains) < self.min_sources_threshold:
+            return {"score": 0.1, "domain_diversity": 0.0}
+
+        # Perform analysis
+        credibility_score = self._calculate_domain_credibility_score(domains)
+        diversity_analysis = self._calculate_diversity_quality(domains)
+
+        # Calculate weighted final score
+        final_score = (
+            credibility_score * self.weights["domain_credibility"]
+            + diversity_analysis["score"] * self.weights["diversity_quality"]
+        )
+
+        # Additional quality adjustments
+        unique_domains = len(set(domains))
+        trusted_count = sum(1 for d in domains if d in TRUSTED_DOMAINS)
+        suspicious_count = sum(1 for d in domains if d in SUSPICIOUS_DOMAINS)
+
+        if trusted_count >= 3 and suspicious_count == 0:
+            final_score += 0.1
+        elif suspicious_count > trusted_count:
+            final_score -= 0.15
+
+        if unique_domains < self.min_unique_domains:
+            final_score = min(final_score, 0.3)
+
+        final_score = max(0.0, min(1.0, final_score))
+
+        return {
+            "score": round(final_score, 3),
+            "domain_diversity": round(diversity_analysis["entropy"], 3),
+        }
+
+
+if __name__ == "__main__":
+    analyzer = NetworkAnalyzer()
+
+    # Test Case 1: Mixed credible and suspicious domains
+    search_results_1 = [
+        "https://reuters.com/news/article1",
+        "https://bbc.com/news/article2",
+        "https://ghanaweb.com/article3",
+        "https://cnn.com/article4",
+        "https://naturalnews.com/fake1",
+        "https://infowars.com/fake2",
+    ]
+    print("\nTest Case 1: Mixed credible and suspicious")
+    result1 = analyzer.analyze_propagation_pattern(search_results_1)
+    print(f"Result: {result1}")
+
+    # Test Case 2: Mostly trusted domains
+    search_results_2 = [
+        "https://bbc.com/article",
+        "https://cnn.com/article",
+        "https://reuters.com/article",
+        "https://nytimes.com/article",
+        "https://ghanaweb.com/article",
+    ]
+    print("\nTest Case 2: Mostly trusted domains")
+    result2 = analyzer.analyze_propagation_pattern(search_results_2)
+    print(f"Result: {result2}")
+
+    # Test Case 3: Mostly suspicious and content farms
+    search_results_3 = [
+        "https://infowars.com/fake",
+        "https://naturalnews.com/fake",
+        "https://clickhole.com/funny",
+        "https://upworthy.com/clickbait",
+        "https://shared.com/share",
+    ]
+    print("\nTest Case 3: Suspicious and content farm heavy")
+    result3 = analyzer.analyze_propagation_pattern(search_results_3)
+    print(f"Result: {result3}")
+
+    # Test Case 4: Low diversity (same domain repeated)
+    search_results_4 = [
+        "https://buzzfeed.com/post1",
+        "https://buzzfeed.com/post2",
+        "https://buzzfeed.com/post3",
+        "https://buzzfeed.com/post4",
+        "https://buzzfeed.com/post5",
+    ]
+    print("\nTest Case 4: Low domain diversity")
+    result4 = analyzer.analyze_propagation_pattern(search_results_4)
+    print(f"Result: {result4}")
+
+    # Test Case 5: Not enough sources
+    search_results_5 = ["https://cnn.com/article1"]
+    print("\nTest Case 5: Insufficient results")
+    result5 = analyzer.analyze_propagation_pattern(search_results_5)
+    print(f"Result: {result5}")
deploy/main/predict_clickbait.py ADDED
@@ -0,0 +1,43 @@
+import pickle
+import numpy as np
+from deploy.utils.clickbait_utils import extract_enhanced_features
+
+
+class ClickbaitPredictor:
+    def __init__(self, model_dir="./models/clickbait"):
+        try:
+            with open(f"{model_dir}/logistic_regression_model.pkl", "rb") as f:
+                self.classifier = pickle.load(f)
+            with open(f"{model_dir}/tfidf_vectorizer.pkl", "rb") as f:
+                self.tfidf_vectorizer = pickle.load(f)
+            with open(f"{model_dir}/feature_info.pkl", "rb") as f:
+                self.clickbait_indicators = pickle.load(f)
+            print("Model loaded successfully")
+        except Exception as e:
+            print(f"Error loading model: {e}")
+            self.classifier = None
+            self.tfidf_vectorizer = None
+            self.clickbait_indicators = None
+
+    def predict(self, headline, threshold=0.5):
+        if self.classifier is None or self.tfidf_vectorizer is None:
+            raise RuntimeError("Model or vectorizer not loaded.")
+        tfidf_features = self.tfidf_vectorizer.transform([headline])
+        handcrafted_features = extract_enhanced_features([headline])
+        combined_features = np.hstack((tfidf_features.toarray(), handcrafted_features))
+        lr_probs = self.classifier.predict_proba(combined_features)[0]
+        lr_score = lr_probs[1]
+        is_clickbait = lr_score >= threshold
+        confidence = lr_score if is_clickbait else (1 - lr_score)
+        return is_clickbait, lr_score, confidence
+
+
+if __name__ == "__main__":
+    predictor = ClickbaitPredictor()
+    while True:
+        headline = input("Enter a headline to check clickbait score: ")
+        is_clickbait, score, confidence = predictor.predict(headline)
+        status = "CLICKBAIT" if is_clickbait else "NORMAL"
+        print(f"{status} (Score: {score:.3f}, Confidence: {confidence:.3f})")
+        print(f"  '{headline}'")
+        print()
deploy/main/source_credibility_analyzer.py ADDED
@@ -0,0 +1,192 @@
+import re
+from urllib.parse import urlparse
+
+from deploy.utils.general_utils import TRUSTED_DOMAINS, SUSPICIOUS_DOMAINS
+
+
+class SourceCredibilityAnalyzer:
+    """Simplified source credibility analyzer - returns only the score"""
+
+    def __init__(self):
+        # Weighted scoring system
+        self.weights = {
+            "tld_credibility": 0.4,
+            "domain_structure": 0.3,
+            "news_indicators": 0.2,
+            "domain_age_indicators": 0.15,
+            "subdomain_analysis": 0.1,
+        }
+
+        # Suspicious patterns
+        self._suspicious_patterns = [
+            (re.compile(r"\d{4,}"), 0.8),
+            (
+                re.compile(r"(fake|hoax|scam|click|bait|spam|phishing)", re.IGNORECASE),
+                0.9,
+            ),
+            (re.compile(r"[a-z]+\d+[a-z]+\d+", re.IGNORECASE), 0.7),
+            (re.compile(r"(xxx|porn|adult|sex)", re.IGNORECASE), 0.6),
+            (re.compile(r"(free|download|crack|hack)", re.IGNORECASE), 0.5),
+            (re.compile(r"[0-9]{1,3}-[0-9]{1,3}-[0-9]{1,3}", re.IGNORECASE), 0.8),
+            (re.compile(r"(temp|tmp|test|demo)", re.IGNORECASE), 0.4),
+        ]
+
+        # TLD scores
+        self.tld_scores = {
+            ".edu": 0.9,
+            ".gov": 0.95,
+            ".mil": 0.9,
+            ".org": 0.7,
+            ".ac.uk": 0.8,
+            ".edu.au": 0.8,
+            ".com": 0.3,
+            ".net": 0.25,
+            ".co.uk": 0.4,
+            ".com.au": 0.4,
+            ".de": 0.4,
+            ".fr": 0.4,
+            ".ca": 0.4,
+            ".jp": 0.4,
+            ".info": 0.1,
+            ".biz": 0.1,
+            ".name": 0.05,
+            ".tk": -0.6,
+            ".ml": -0.6,
+            ".ga": -0.6,
+            ".cf": -0.6,
+            ".pw": -0.4,
+            ".top": -0.3,
+            ".click": -0.5,
+            ".download": -0.4,
+            ".stream": -0.3,
+            ".review": -0.2,
+            ".date": -0.3,
+            ".racing": -0.4,
+        }
+
+        # News indicators
+        self.news_indicators = {
+            "news": 0.3,
+            "times": 0.3,
+            "post": 0.25,
+            "herald": 0.2,
+            "gazette": 0.2,
+            "journal": 0.2,
+            "tribune": 0.2,
+            "chronicle": 0.2,
+            "report": 0.15,
+            "press": 0.2,
+            "media": 0.1,
+            "broadcast": 0.15,
+            "reuters": 0.4,
+            "associated": 0.3,
+            "wire": 0.2,
+        }
+
+    def analyze_domain_credibility(self, domain: str) -> float:
+        """Get credibility score for domain"""
+        domain = domain.lower().strip()
+
+        # Handle URLs by extracting domain
+        if domain.startswith(("http://", "https://")):
+            parsed = urlparse(domain)
+            domain = parsed.netloc.lower()
+
+        # Remove www prefix
+        if domain.startswith("www."):
+            domain = domain[4:]
+
+        # Check trusted domains
+        if domain in TRUSTED_DOMAINS:
+            return 0.95
+
+        # Check suspicious domains
+        if domain in SUSPICIOUS_DOMAINS:
+            return 0.05
+
+        # Calculate score components
+        tld_score = self._get_tld_score(domain)
+        structure_score = self._get_structure_score(domain)
+        news_score = self._get_news_score(domain)
+        establishment_score = self._get_establishment_score(domain)
+        subdomain_score = self._get_subdomain_score(domain)
+
+        # Start with base score and apply weighted components
+        base_score = 0.2
+        final_score = base_score
+        final_score += tld_score * self.weights["tld_credibility"]
+        final_score += structure_score * self.weights["domain_structure"]
+        final_score += news_score * self.weights["news_indicators"]
+        final_score += establishment_score * self.weights["domain_age_indicators"]
+        final_score += subdomain_score * self.weights["subdomain_analysis"]
+
+        return max(0.0, min(1.0, round(final_score, 2)))
+
+    def _get_tld_score(self, domain: str) -> float:
+        """Get TLD score"""
+        for tld, score in self.tld_scores.items():
+            if domain.endswith(tld):
+                return score
+        return -0.1  # Unknown TLD
+
+    def _get_structure_score(self, domain: str) -> float:
+        """Get domain structure score"""
+        suspicious_score = 0
+
+        for pattern, severity in self._suspicious_patterns:
+            if pattern.search(domain):
+                suspicious_score -= severity * 0.3
+
+        if len(domain.split(".")[0]) < 3:
+            suspicious_score -= 0.2
+
+        if domain.count("-") > 2:
+            suspicious_score -= 0.15
+
+        return max(-0.8, suspicious_score)
+
+    def _get_news_score(self, domain: str) -> float:
+        """Get news indicators score"""
+        score = 0
+        for indicator, weight in self.news_indicators.items():
+            if indicator in domain:
+                score += weight
+        return min(0.4, score)
+
+    def _get_establishment_score(self, domain: str) -> float:
+        """Get establishment indicators score"""
+        score = 0
+
+        if any(
+            word in domain
+            for word in ["university", "college", "institute", "foundation"]
+        ):
+            score += 0.3
+
+        if any(word in domain for word in ["library", "museum", "archive"]):
+            score += 0.2
+
+        if any(word in domain for word in ["research", "study", "science"]):
+            score += 0.15
+
+        return min(0.3, score)
+
+    def _get_subdomain_score(self, domain: str) -> float:
+        """Get subdomain score"""
+        parts = domain.split(".")
+
+        if len(parts) <= 2:
+            return 0.1
+        elif len(parts) > 4:
+            return -0.15
+        else:
+            return 0
+
+
+if __name__ == "__main__":
+    analyzer = SourceCredibilityAnalyzer()
+    # domains_to_analyze = ["ghanaweb.com"]
+    domain = input("Enter a domain to check credibility: ")
+    # for domain in domains_to_analyze:
+    result = analyzer.analyze_domain_credibility(domain)
+    print(f"{domain} -> {result:.2f}")
deploy/utils/__init__.py ADDED
File without changes
deploy/utils/clickbait_utils.py ADDED
@@ -0,0 +1,145 @@
+import re
+import numpy as np
+
+clickbait_indicators = {
+    "curiosity_gap": [
+        "you won't believe",
+        "wait until you see",
+        "what happened next",
+        "the reason will shock you",
+        "this is why",
+        "here's what happened",
+        "the truth about",
+        "what nobody tells you",
+        "finally revealed",
+    ],
+    "emotional_triggers": [
+        "shocking",
+        "incredible",
+        "amazing",
+        "unbelievable",
+        "stunning",
+        "heartbreaking",
+        "hilarious",
+        "terrifying",
+        "adorable",
+        "outrageous",
+        "mind-blowing",
+        "jaw-dropping",
+        "breathtaking",
+    ],
+    "urgency_scarcity": [
+        "breaking",
+        "urgent",
+        "limited time",
+        "before it's too late",
+        "act now",
+        "don't miss",
+        "last chance",
+        "expires soon",
+    ],
+    "personal_relevance": [
+        "in your area",
+        "people like you",
+        "your age",
+        "based on your",
+        "you need to know",
+        "this affects you",
+        "for people who",
+    ],
+    "superlatives": [
+        "ultimate",
+        "perfect",
+        "best ever",
+        "greatest",
+        "worst",
+        "most amazing",
+        "incredible",
+        "unmatched",
+        "revolutionary",
+    ],
+    "numbers_lists": [
+        r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)",
+        r"one\s+(weird|simple|amazing)\s+trick",
+        r"\d+\s+minute[s]?",
+        r"in\s+\d+\s+(steps?|minutes?|days?)",
+    ],
+    "authority_social_proof": [
+        "doctors hate",
+        "experts don't want",
+        "celebrities use",
+        "scientists discovered",
+        "research shows",
+        "studies prove",
+    ],
+}
+
+
+def extract_enhanced_features(texts):
+    """Extract comprehensive handcrafted features"""
+    features = []
+
+    for text in texts:
+        if not isinstance(text, str):
+            text = str(text) if text is not None else ""
+
+        text_lower = text.lower()
+        feature_vector = []
+
+        # Clickbait pattern scores by category
+        for category, patterns in clickbait_indicators.items():
+            category_score = 0
+            for pattern in patterns:
+                if isinstance(pattern, str):
+                    if pattern in text_lower:
+                        category_score += 1
+                else:  # regex pattern
+                    if re.search(pattern, text_lower):
+                        category_score += 1
+
+            # Normalize by pattern count in category
+            normalized_score = min(category_score / len(patterns), 1.0)
+            feature_vector.append(normalized_score)
+
+        # Punctuation and formatting features
+        exclamation_ratio = text.count("!") / max(len(text), 1)
+        question_ratio = text.count("?") / max(len(text), 1)
+        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
+
+        feature_vector.extend(
+            [
+                min(exclamation_ratio * 10, 1.0),
+                min(question_ratio * 10, 1.0),
+                min(caps_ratio * 5, 1.0),
+            ]
+        )
+
+        # Length and structure features
+        words = text.split()
+        word_count = len(words)
+        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
+
+        feature_vector.extend(
+            [
+                min(word_count / 20, 1.0),  # Normalized word count
+                min(avg_word_length / 8, 1.0),  # Normalized avg word length
+                1.0 if word_count > 10 else 0.0,  # Long headline indicator
+            ]
+        )
+
+        # Semantic features
+        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
+        number_count = len(
+            [word for word in words if any(char.isdigit() for char in word)]
+        )
+
+        feature_vector.extend(
+            [
+                min(all_caps_words / max(word_count, 1), 1.0),
+                min(number_count / max(word_count, 1), 1.0),
+            ]
+        )
+
+        features.append(feature_vector)
+
+    return np.array(features)
deploy/utils/content_extractor.py ADDED
@@ -0,0 +1,140 @@
+import re
+import time
+import requests
+from bs4 import BeautifulSoup
+from newspaper import Article
+
+
+def extract_content(
+    url: str,
+    content_cache: dict,
+    cache_key: str,
+    get_user_agent,
+    timeout: int,
+    cache_size: int,
+) -> str:
+    """Enhanced content extraction with newspaper3k fallback to BeautifulSoup."""
+    if cache_key in content_cache:
+        return content_cache[cache_key]
+
+    try:
+        # Try newspaper3k first
+        article = Article(url)
+        article.download()
+        article.parse()
+
+        content = article.text
+
+        # If newspaper3k didn't get good content, fall back to BeautifulSoup
+        if not content or len(content.strip()) < 100:
+            content = _fallback_extraction(url, get_user_agent, timeout)
+
+        # Clean and normalize content
+        content = _clean_content(content)
+        content = content[:10000]  # Increased from 8000
+
+        # Cache result
+        if len(content_cache) >= cache_size:
+            oldest_key = next(iter(content_cache))
+            del content_cache[oldest_key]
+
+        content_cache[cache_key] = content
+        return content
+
+    except Exception:
+        # If newspaper3k fails, try BeautifulSoup fallback
+        try:
+            content = _fallback_extraction(url, get_user_agent, timeout)
+            content = _clean_content(content)
+            content = content[:10000]
+
+            if len(content_cache) >= cache_size:
+                oldest_key = next(iter(content_cache))
+                del content_cache[oldest_key]
+
+            content_cache[cache_key] = content
+            return content
+        except Exception:
+            return ""
+
+
+def _fallback_extraction(url: str, get_user_agent, timeout: int) -> str:
+    """Fallback extraction using BeautifulSoup."""
+    headers = {
+        "User-Agent": get_user_agent(),
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+    }
+
+    time.sleep(0.5)
+
+    response = requests.get(url, headers=headers, timeout=timeout)
+    response.raise_for_status()
+
+    # Handle encoding
+    if response.encoding is None or response.encoding.lower() in ["iso-8859-1", "ascii"]:
+        response.encoding = "utf-8"
+
+    try:
+        html_content = response.text
+    except UnicodeDecodeError:
+        try:
+            html_content = response.content.decode("utf-8", errors="ignore")
+        except UnicodeDecodeError:
+            html_content = response.content.decode("latin-1", errors="replace")
+
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Remove irrelevant content
+    for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "iframe"]):
+        element.decompose()
+
+    # Extract content using selectors
+    content_selectors = [
+        "article",
+        "main",
+        '[role="main"]',
+        ".content",
+        ".article-content",
+        ".post-content",
+        ".entry-content",
+        ".article-body",
+    ]
+
+    extracted_text = ""
+    for selector in content_selectors:
+        elements = soup.select(selector)
+        if elements:
+            extracted_text = " ".join([elem.get_text(separator=" ", strip=True) for elem in elements])
+            break
+
+    if not extracted_text:
+        content_elements = soup.find_all(["p", "div"], class_=lambda x: x is None or "ad" not in str(x).lower())
+        extracted_text = " ".join([elem.get_text(separator=" ", strip=True) for elem in content_elements])
+
+    if not extracted_text:
+        extracted_text = soup.get_text(separator=" ", strip=True)
+
+    return extracted_text
+
+
+def _clean_content(content: str) -> str:
+    """Clean and normalize extracted content."""
+    # Clean problematic characters
+    content = content.replace("\ufffd", " ")
+    content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x84\x86-\x9f]", " ", content)
+
+    # Normalize unicode if available
+    try:
+        import unicodedata
+        content = unicodedata.normalize("NFKD", content)
+    except Exception:
+        pass
+
+    # Normalize whitespace and clean
+    content = re.sub(r"\s+", " ", content).strip()
+    content = re.sub(r"[^\x20-\x7E\u00A0-\uFFFF]", " ", content)
+
+    return content
deploy/utils/general_utils.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import tldextract
+ import re
+ from multiprocessing import Pool, cpu_count
+ from tqdm import tqdm
+
+ # Trusted domains
+ TRUSTED_DOMAINS = {
+     # 🌍 International Mainstream News
+     "abcnews.go.com",
+     "aljazeera.com",
+     "apnews.com",
+     "bbc.com",
+     "bloomberg.com",
+     "cbc.ca",
+     "cbsnews.com",
+     "cnn.com",
+     "dw.com",
+     "economist.com",
+     "euronews.com",
+     "forbes.com",
+     "ft.com",
+     "indiatimes.com",
+     "japantimes.co.jp",
+     "latimes.com",
+     "npr.org",
+     "nytimes.com",
+     "reuters.com",
+     "smh.com.au",
+     "theguardian.com",
+     "usatoday.com",
+     "washingtonpost.com",
+     "wsj.com",
+     "france24.com",
+     # πŸ“° Ghana-Specific News
+     "3news.com",
+     "adomonline.com",
+     "citinewsroom.com",
+     "ghanaweb.com",
+     "ghanaiantimes.com.gh",
+     "ghananewsagency.org",
+     "graphic.com.gh",
+     "modernghana.com",
+     "myjoyonline.com",
+     "peacefmonline.com",
+     "pulse.com.gh",
+     "starrfm.com.gh",
+     "thebftonline.com",
+     "yen.com.gh",
+     "nsmq.com.gh",
+     # ⚽ Sports News
+     "cbssports.com",
+     "espn.com",
+     "eurosport.com",
+     "fifa.com",
+     "footballghana.com",
+     "foxsports.com",
+     "ghanasoccernet.com",
+     "goal.com",
+     "nba.com",
+     "nbcsports.com",
+     "onefootball.com",
+     "skysports.com",
+     "sportinglife.com",
+     "supersport.com",
+     "tntsports.co.uk",
+     "theathletic.com",
+     "olympics.com",
+     # 🎬 Entertainment & Pop Culture
+     "billboard.com",
+     "deadline.com",
+     "entertainment.com",
+     "eonline.com",
+     "ew.com",
+     "hollywoodreporter.com",
+     "indiewire.com",
+     "people.com",
+     "rollingstone.com",
+     "thewrap.com",
+     "variety.com",
+     # πŸ§ͺ Science & Research
+     "eurekalert.org",
+     "medpagetoday.com",
+     "nasa.gov",
+     "nature.com",
+     "sciencealert.com",
+     "sciencenews.org",
+     "statnews.com",
+     # 🌐 Fact-Checking & Watchdogs
+     "africacheck.org",
+     "factcheck.org",
+     "fullfact.org",
+     "politifact.com",
+     "snopes.com",
+     # 🌍 Global & General Niche News
+     "asia.nikkei.com",
+     "globalissues.org",
+     "ipsnews.net",
+     "oecdobserver.org",
+     "rferl.org",
+     # πŸ“° African Regional News (non-Ghana)
+     "dailynation.africa",
+     "enca.com",
+     "ewn.co.za",
+     "monitor.co.ug",
+     "thecitizen.co.tz",
+     "businessinsider.com",
+     "africanews.com",
+     # πŸŽ“ Academic & Policy Think Tanks
+     "brookings.edu",
+     "carnegieendowment.org",
+     "cfr.org",
+     "foreignpolicy.com",
+     "theconversation.com",
+ }
+
+ # Suspicious domains that often spread misinformation
+ SUSPICIOUS_DOMAINS = {
+     "beforeitsnews.com",
+     "naturalnews.com",
+     "infowars.com",
+     "breitbart.com",
+     "dailystormer.com",
+     "zerohedge.com",
+     "activistpost.com",
+     "realfarmacy.com",
+     "healthnutnews.com",
+ }
+
+
+ def extract_domain(url):
+     """Extract domain from URL"""
+     ext = tldextract.extract(url)
+     return f"{ext.domain}.{ext.suffix}"
+
+
+ _PATTERNS = [
+     (re.compile(r"\b[A-Z]+\s*\(Reuters\)\s*[-–—]?\s*", re.IGNORECASE), ""),
+     (re.compile(r"\(Reuters\)", re.IGNORECASE), ""),
+     (re.compile(r"Reuters", re.IGNORECASE), ""),
+     (
+         re.compile(
+             r"\b(?:WASHINGTON|NEW YORK|LONDON|PARIS|BERLIN|TOKYO|MOSCOW|BEIJING|DELHI)\s*[-–—]?\s*",
+             re.IGNORECASE,
+         ),
+         "",
+     ),
+     (re.compile(r"\b(?:AP|CNN|BBC|Fox News|NBC|CBS|ABC News)\b", re.IGNORECASE), ""),
+     (re.compile(r"\bBy\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", re.IGNORECASE), ""),
+     (re.compile(r"\S+@\S+\.\S+"), ""),
+     (re.compile(r"http[s]?://\S+"), ""),
+     (re.compile(r"[^a-zA-Z\s]"), " "),
+     (re.compile(r"\s+"), " "),
+ ]
+
+
+ def remove_source_artifacts_fast(text):
+     """Optimized version of source artifact removal"""
+     if not isinstance(text, str) or len(text) < 10:
+         return ""
+
+     for pattern, replacement in _PATTERNS:
+         text = pattern.sub(replacement, text)
+
+     return text.strip().lower()
+
+
+ def _process_text_chunk(text_chunk):
+     """Internal helper to process a chunk of texts in parallel"""
+     return [remove_source_artifacts_fast(text) for text in text_chunk]
+
+
+ def parallel_preprocess(texts, n_jobs=None):
+     """Parallel preprocessing of texts using multiprocessing"""
+     if n_jobs is None:
+         n_jobs = min(cpu_count(), 8)
+
+     chunk_size = max(1, len(texts) // n_jobs)
+     chunks = [texts[i : i + chunk_size] for i in range(0, len(texts), chunk_size)]
+
+     print(
+         f"Processing {len(texts)} texts in {len(chunks)} chunks using {n_jobs} processes..."
+     )
+
+     with Pool(n_jobs) as pool:
+         results = list(
+             tqdm(
+                 pool.imap(_process_text_chunk, chunks),
+                 total=len(chunks),
+                 desc="Preprocessing chunks",
+             )
+         )
+
+     processed_texts = []
+     for chunk_result in results:
+         processed_texts.extend(chunk_result)
+
+     return processed_texts
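A minimal usage sketch for the helpers above, assuming the module is importable as `deploy.utils.general_utils` (the sample URL and texts are made up):

    from deploy.utils.general_utils import (
        TRUSTED_DOMAINS,
        SUSPICIOUS_DOMAINS,
        extract_domain,
        remove_source_artifacts_fast,
        parallel_preprocess,
    )

    url = "https://www.reuters.com/world/example-story"
    domain = extract_domain(url)
    print(domain, domain in TRUSTED_DOMAINS, domain in SUSPICIOUS_DOMAINS)

    texts = [
        "WASHINGTON (Reuters) - By Jane Doe: Markets rallied today. https://example.com",
        "LONDON - BBC reports that the policy passed with a narrow majority.",
    ]
    print(remove_source_artifacts_fast(texts[0]))

    # parallel_preprocess spawns worker processes, so keep it under the main guard
    if __name__ == "__main__":
        print(parallel_preprocess(texts, n_jobs=2))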
deploy/utils/url_filter.py ADDED
@@ -0,0 +1,116 @@
+ import re
+
+
+ def _is_pdf_or_download_url(url: str) -> bool:
+     """Check if URL points to a PDF or download file."""
+     url_lower = url.lower()
+
+     # Check for PDF in URL path
+     if url_lower.endswith(".pdf"):
+         return True
+
+     # Check for PDF in URL path with query parameters
+     if ".pdf?" in url_lower or ".pdf#" in url_lower:
+         return True
+
+     # Check for other document/download formats
+     download_extensions = [
+         ".doc",
+         ".docx",
+         ".xls",
+         ".xlsx",
+         ".ppt",
+         ".pptx",
+         ".zip",
+         ".rar",
+         ".tar",
+         ".gz",
+         ".7z",
+         ".mp3",
+         ".mp4",
+         ".avi",
+         ".mov",
+         ".wmv",
+         ".exe",
+         ".msi",
+         ".dmg",
+         ".pkg",
+         ".epub",
+         ".mobi",
+         ".djvu",
+     ]
+
+     for ext in download_extensions:
+         if url_lower.endswith(ext) or f"{ext}?" in url_lower or f"{ext}#" in url_lower:
+             return True
+
+     # Check for common download URL patterns
+     download_patterns = [
+         r"/download/",
+         r"/downloads/",
+         r"/attachments/",
+         r"/files/",
+         r"/uploads/",
+         r"/wp-content/uploads/",
+         r"/content/uploads/",
+         r"/assets/downloads/",
+         r"/documents/",
+         r"/pdfs/",
+         r"\.pdf$",
+         r"\.pdf\?",
+         r"\.pdf#",
+         r"attachment\.aspx",
+         r"download\.aspx",
+         r"getfile\.aspx",
+         r"viewdocument\.aspx",
+     ]
+
+     return any(re.search(pattern, url_lower) for pattern in download_patterns)
+
+
+ def _is_corrupted_pdf_content(content: str) -> bool:
+     """Detect if content appears to be corrupted PDF text."""
+     if not content or len(content.strip()) < 10:
+         return False
+
+     # Common PDF corruption indicators
+     pdf_corruption_patterns = [
+         r"endstream\s+endobj",
+         r"obj\s*<[^>]*>\s*stream",
+         r"%PDF-\d+\.\d+",
+         r"xref\s+\d+",
+         r"trailer\s*<<",
+         r"startxref",
+         r"%%EOF",
+         r"stream\s+H\s+[^\w\s]{10,}",  # Stream followed by garbled text
+         r"[^\w\s]{20,}",  # Long sequences of non-word/space characters
+         r"obj\s+<\s*>\s*stream",
+         r"BT\s+/F\d+",  # PDF text object indicators
+         r"ET\s+Q\s+q",  # PDF graphics state operators
+     ]
+
+     corruption_score = 0
+     for pattern in pdf_corruption_patterns:
+         if re.search(pattern, content, re.IGNORECASE):
+             corruption_score += 1
+
+     # Check character distribution - PDFs often have weird character distributions
+     if len(content) > 50:
+         # Count non-printable or unusual characters
+         unusual_chars = sum(
+             1 for c in content if ord(c) > 127 or (ord(c) < 32 and c not in "\t\n\r ")
+         )
+         unusual_ratio = unusual_chars / len(content)
+
+         if unusual_ratio > 0.3:  # More than 30% unusual characters
+             corruption_score += 2
+
+     # Check for excessive special characters in a row
+     if re.search(r"[^\w\s]{15,}", content):
+         corruption_score += 1
+
+     # Check for PDF-specific garbled patterns
+     if re.search(r"[A-Za-z0-9]{2,}\s+[^\w\s]{5,}\s+[A-Za-z0-9]{2,}", content):
+         corruption_score += 1
+
+     return corruption_score >= 2
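A short sketch of how these filters might be applied before scraping candidate URLs, assuming the module is importable as `deploy.utils.url_filter` (the URLs below are illustrative):

    from deploy.utils.url_filter import _is_pdf_or_download_url, _is_corrupted_pdf_content

    candidates = [
        "https://example.com/report.pdf",
        "https://example.com/wp-content/uploads/2024/01/brief.docx",
        "https://www.bbc.com/news/world-12345678",
    ]
    scrapable = [u for u in candidates if not _is_pdf_or_download_url(u)]
    print(scrapable)  # only the regular article URL remains

    sample = "%PDF-1.7 obj << >> stream endstream endobj"
    print(_is_corrupted_pdf_content(sample))  # True: several corruption markers match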
models/clickbait/feature_info.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:053e642a1e6692cd3ca116bb36aca8aab7c65f45ac07ccd75babd638debf07e3
+ size 1126
models/clickbait/logistic_regression_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c03ee9383f2dd1fe51d335adbc292975fd051202d981cdff0805a16941b6f80
+ size 40845
models/clickbait/tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1e7a97703602d9eafb046d14e9dd776fce4dbc050cf7ca201fedbc4dd31b13c
+ size 186468
requirements.txt CHANGED
@@ -1,3 +1,13 @@
+ beautifulsoup4==4.13.4
+ googlesearch-python==1.3.0
+ numpy==2.0.2
+ requests==2.32.3
+ tldextract==5.3.0
+ tqdm==4.67.1
+ newspaper3k
+ lxml_html_clean
+ nltk==3.9.1
  sentence-transformers==4.1.0
  torch==2.7.1
+ scikit-learn==1.6.1
  textblob==0.19.0
semantic_similarity.py ADDED
@@ -0,0 +1,92 @@
+ import os
+
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+ from typing import List
+
+ import torch
+ from sentence_transformers import SentenceTransformer, util
+ from textblob import TextBlob
+
+ model = SentenceTransformer("paraphrase-MiniLM-L12-v2")
+ model.eval()
+
+
+ def calculate_semantic_similarity(
+     claim: str, sentences: List[str], similarity_threshold: float = 0.4
+ ) -> float:
+     """
+     Calculates a weighted score representing how well a list of sentences supports a claim.
+
+     Args:
+         claim (str): The claim to be verified.
+         sentences (List[str]): A list of sentences to check against the claim.
+         similarity_threshold (float, optional): The minimum similarity score for a
+             sentence to be considered "supporting". Defaults to 0.4.
+
+     Returns:
+         float: A weighted score between 0.0 and 1.0.
+     """
+     if not sentences:
+         return 0.0
+
+     all_scores = []
+
+     with torch.no_grad():
+         claim_embedding = model.encode(claim, show_progress_bar=False)
+         sentence_embeddings = model.encode(sentences, show_progress_bar=False)
+         cosine_scores = util.cos_sim(claim_embedding, sentence_embeddings)[0]
+         claim_sentiment = TextBlob(claim).sentiment.polarity
+
+         for i, sentence in enumerate(sentences):
+             similarity = cosine_scores[i].item()
+             sentence_sentiment = TextBlob(sentence).sentiment.polarity
+
+             # Nudge the score up when claim and sentence agree in sentiment,
+             # and down when they disagree.
+             if claim_sentiment * sentence_sentiment > 0:
+                 similarity *= 1.1
+             elif claim_sentiment * sentence_sentiment < 0:
+                 similarity *= 0.9
+
+             similarity = max(0.0, min(1.0, similarity))
+             all_scores.append(similarity)
+
+     supporting_scores = [s for s in all_scores if s >= similarity_threshold]
+     proportion_supporting = len(supporting_scores) / len(sentences)
+
+     if proportion_supporting >= 0.30:
+         final_score = sum(supporting_scores) / len(supporting_scores)
+     else:
+         # Too few supporting sentences: fall back to the average over all scores.
+         # penalty = 0.80  # optional 20% reduction
+         final_score = sum(all_scores) / len(all_scores)  # * penalty
+
+     return final_score
+
+
+ if __name__ == "__main__":
+     while True:
+         claim_to_verify = input("Enter claim to verify: ")
+         evidence = input(
+             "Enter evidence sentences separated by ';' (leave blank to use the built-in examples): "
+         )
+
+         if evidence.strip():
+             evidence_sentences = [s.strip() for s in evidence.split(";") if s.strip()]
+         else:
+             evidence_sentences = [
+                 "The recent legislation is projected to stimulate significant economic growth.",  # High similarity
+                 "Market analysts are optimistic about the financial future following the announcement.",  # High similarity
+                 "However, some critics argue that the policy might lead to unforeseen inflation.",  # Low similarity
+                 "The stock market reacted positively, showing a slight increase.",  # Medium similarity
+                 "This is considered a poor decision for the nation's financial stability by some experts.",  # Opposing sentiment
+                 "The primary goal of the initiative is to create jobs and encourage consumer spending.",  # High similarity
+                 "Unemployment rates are expected to decline in the coming months.",  # High similarity
+                 "There has been some public disapproval regarding the policy's rollout.",  # Low similarity
+                 "This will surely lead to a stronger and more resilient economy.",  # High similarity
+                 "Financial experts have voiced concerns about the potential long-term consequences.",  # Opposing sentiment
+             ]
+
+         final_score = calculate_semantic_similarity(claim_to_verify, evidence_sentences)
+
+         print(f"The final weighted support score for the claim is: {final_score:.4f}")
+
+         if final_score > 0.65:
+             print("Interpretation: The claim is strongly supported by the evidence. βœ…")
+         elif final_score > 0.4:
+             print("Interpretation: The claim has moderate support from the evidence. πŸ€”")
+         else:
+             print("Interpretation: The claim has weak support from the evidence. ❌")