Spaces: Running
Commit · f078461
Parent(s): b372940
Fake news detector with Gradio interface
Browse files
- .gitattributes +5 -0
- app.py +8 -55
- deploy/__init__.py +0 -0
- deploy/index.py +361 -0
- deploy/main/__init__.py +0 -0
- deploy/main/claim_verifier.py +371 -0
- deploy/main/network_analyzer.py +259 -0
- deploy/main/predict_clickbait.py +43 -0
- deploy/main/source_credibility_analyzer.py +192 -0
- deploy/utils/__init__.py +0 -0
- deploy/utils/clickbait_utils.py +145 -0
- deploy/utils/content_extractor.py +140 -0
- deploy/utils/general_utils.py +197 -0
- deploy/utils/url_filter.py +116 -0
- models/clickbait/feature_info.pkl +3 -0
- models/clickbait/logistic_regression_model.pkl +3 -0
- models/clickbait/tfidf_vectorizer.pkl +3 -0
- requirements.txt +10 -0
- semantic_similarity.py +92 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.vscode
+.venv
+venv
+**/__pycache__/
+snli_1.0_dev.jsonl
app.py
CHANGED
@@ -1,67 +1,20 @@
 import os
+
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 
-from typing import List
-import torch
-from sentence_transformers import SentenceTransformer, util
-from textblob import TextBlob
 import gradio as gr
+from deploy.index import FakeNewsDetector
 
-
-model.eval()
-
-
-def calculate_semantic_similarity(
-    claim: str, sentences_input: str, similarity_threshold: float = 0.4
-) -> float:
-    """
-    Accepts a claim and newline-separated sentences. Returns a weighted similarity score.
-    """
-    sentences = [s.strip() for s in sentences_input.split("\n") if s.strip()]
-    if not sentences:
-        return 0.0
-
-    all_scores = []
-
-    with torch.no_grad():
-        claim_embedding = model.encode(claim, show_progress_bar=False)
-        sentence_embeddings = model.encode(sentences, show_progress_bar=False)
-        cosine_scores = util.cos_sim(claim_embedding, sentence_embeddings)[0]
-        claim_sentiment = TextBlob(claim).sentiment.polarity
-
-        for i, sentence in enumerate(sentences):
-            similarity = cosine_scores[i].item()
-            sentence_sentiment = TextBlob(sentence).sentiment.polarity
-
-            if claim_sentiment * sentence_sentiment > 0:
-                similarity *= 1.1
-            elif claim_sentiment * sentence_sentiment < 0:
-                similarity *= 0.9
-
-            similarity = max(0.0, min(1.0, similarity))
-            all_scores.append(similarity)
-
-    supporting_scores = [s for s in all_scores if s >= similarity_threshold]
-    proportion_supporting = len(supporting_scores) / len(sentences)
-
-    if proportion_supporting >= 0.30:
-        final_score = sum(supporting_scores) / len(supporting_scores)
-    else:
-        final_score = sum(all_scores) / len(all_scores)
-
-    return round(final_score, 4)
-
+detector = FakeNewsDetector()
 
 iface = gr.Interface(
-    fn=
+    fn=detector.comprehensive_verify,
     inputs=[
-        gr.Textbox(label="
-        gr.Textbox(lines=10, label="Evidence Sentences (one per line)"),
-        gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.4, label="Similarity Threshold")
+        gr.Textbox(label="Headline"),
     ],
-    outputs=gr.
-    title="
-    description="Input a
+    outputs=gr.JSON(label="Analysis Result"),  # JSON output for structured verdict
+    title="Fake News Detector",
+    description="Input a headline to check how credible it is.",
 )
 
 iface.launch()
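With this commit the Space itself is only a thin Gradio wrapper: the old in-app similarity scoring is gone and all analysis lives in deploy.index.FakeNewsDetector, whose comprehensive_verify method returns the JSON verdict rendered by gr.JSON. A minimal sketch of driving the detector without the UI, assuming the repo's deploy/ package and models/clickbait/ pickles are importable locally (the headline is just an example):

# Minimal sketch: call the detector directly instead of through the Gradio UI.
# Assumes this repo's deploy/ package and models/clickbait/ artifacts are on the path.
from deploy.index import FakeNewsDetector

detector = FakeNewsDetector()

# comprehensive_verify returns the nested dict that gr.JSON renders in the Space.
report = detector.comprehensive_verify("Example headline to fact-check", results_to_check=8)

print(report["final_verdict"]["verdict"])
print(report["final_verdict"]["score"])
print(report["components"]["source_credibility"]["trusted_count"])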
deploy/__init__.py
ADDED
File without changes
deploy/index.py
ADDED
@@ -0,0 +1,361 @@
import os


os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import gc
import time
import random
from datetime import datetime
from typing import Dict, List, Tuple, Any
import numpy as np
from googlesearch import search

from deploy.main.claim_verifier import ClaimVerifier
from deploy.main.network_analyzer import NetworkAnalyzer
from deploy.main.source_credibility_analyzer import SourceCredibilityAnalyzer
from deploy.utils.general_utils import extract_domain
from deploy.main.predict_clickbait import ClickbaitPredictor

import nltk

try:
    nltk.data.find("tokenizers/punkt")
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt")
    nltk.download("punkt_tab")


class FakeNewsDetector:
    """Main enhanced fact checker with ML integration"""

    def __init__(self):
        try:
            self.source_analyzer = SourceCredibilityAnalyzer()
            self.claim_verifier = ClaimVerifier()
            self.network_analyzer = NetworkAnalyzer()
            self.clickbait_predictor = ClickbaitPredictor()
        except Exception as e:
            print(f"❌ Error initializing components: {e}")
            raise

    def _to_float(self, value: Any, default: float = 0.0) -> float:
        """Safely convert any numeric value to Python float"""
        try:
            if isinstance(value, (np.integer, np.floating)):
                return float(value)
            elif isinstance(value, (int, float)):
                return float(value)
            else:
                return default
        except (ValueError, TypeError):
            return default

    def _analyze_clickbait(self, headline: str) -> float:
        """Analyzes the headline for clickbait characteristics."""
        print("🧠 ML Clickbait Analysis...")
        try:
            _, clickbait_score, _ = self.clickbait_predictor.predict(headline)
            clickbait_score = self._to_float(clickbait_score, 0.5)
            print(f"   Clickbait Score: {clickbait_score:.2f}")
            return clickbait_score
        except Exception as e:
            print(f"   ❌ Clickbait analysis error: {e}")
            return 0.5  # Default moderate score

    def _search_for_sources(self, headline: str, num_results: int) -> List[str]:
        """Searches the web for sources related to the headline."""
        # print("Searching and analyzing sources...")
        try:
            time.sleep(random.uniform(1.5, 3.0))
            search_results = list(search(headline, num_results=num_results, lang="en"))
            # print(f"   Found {len(search_results)} search results")
            return search_results
        except Exception as e:
            print(f"   ❌ Search error: {e}")
            return []

    def _analyze_source_credibility(
        self, search_results: List[str]
    ) -> Tuple[float, int, int]:
        """Analyzes the credibility of the found source domains."""
        print("Analyzing source credibility...")

        if not search_results:
            print("   ❌ No search results to analyze")
            return 0.1, 0, 0

        source_scores = []
        trusted_count = 0
        suspicious_count = 0

        for i, url in enumerate(search_results):
            try:
                domain = extract_domain(url)
                credibility_score = self.source_analyzer.analyze_domain_credibility(
                    domain
                )
                credibility_score = self._to_float(credibility_score, 0.5)
                source_scores.append(credibility_score)

                if credibility_score > 0.7:
                    trusted_count += 1
                    print(f"   {i+1}. {domain} ✅ ({credibility_score:.2f})")
                elif credibility_score < 0.3:
                    suspicious_count += 1
                    print(f"   {i+1}. {domain} ❌ ({credibility_score:.2f})")
                else:
                    print(f"   {i+1}. {domain} ({credibility_score:.2f})")
            except Exception as e:
                print(f"   ❌ Error analyzing {url}: {e}")
                source_scores.append(0.3)  # Default neutral score

        # Use regular Python mean instead of np.mean
        avg_credibility = (
            sum(source_scores) / len(source_scores) if source_scores else 0.1
        )
        return avg_credibility, trusted_count, suspicious_count

    def _analyze_network_propagation(
        self, search_results: List[str]
    ) -> Dict[str, float]:
        """Analyzes the propagation pattern of the news across the network."""
        print("Network Propagation Analysis...")

        if not search_results:
            print("   ❌ No search results for network analysis")
            return {"score": 0.1, "domain_diversity": 0.0}

        try:
            network_analysis = self.network_analyzer.analyze_propagation_pattern(
                search_results
            )

            # Convert all values to Python floats
            result = {
                "score": self._to_float(network_analysis.get("score", 0.1)),
                "domain_diversity": self._to_float(
                    network_analysis.get("domain_diversity", 0.0)
                ),
            }

            print(f"   Propagation Score: {result['score']:.2f}")
            print(f"   Domain Diversity: {result['domain_diversity']:.2f}")
            return result
        except Exception as e:
            print(f"   ❌ Network analysis error: {e}")
            return {"score": 0.1, "domain_diversity": 0.0}

    def _verify_claim(self, headline: str, search_results: List[str]) -> float:
        """Verifies the claim against the content of the found sources."""
        print("✅ Verifying Claims...")

        if not search_results:
            print("   ❌ No search results for claim verification")
            return 0.4

        try:
            verification = self.claim_verifier.verify_claim_against_sources(
                headline, search_results
            )
            claim_verification_score = self._to_float(verification.get("score", 0.4))
            print(f"   '{headline}': {claim_verification_score:.2f}")
            return claim_verification_score
        except Exception as e:
            print(f"   ❌ Claim verification error: {e}")
            return 0.4

    def _calculate_final_score_and_verdict(
        self, component_scores: Dict[str, float]
    ) -> Tuple[float, str, str]:
        """Calculates the final weighted score and determines the verdict."""
        weights = {
            "source_credibility": 0.35,
            "claim_verification": 0.35,
            "network_propagation": 0.20,
            "clickbait_detection": 0.10,
        }

        final_score = sum(
            component_scores.get(component, 0.0) * weight
            for component, weight in weights.items()
        )

        if final_score >= 0.75:
            verdict = "Credible – Backed by Evidence"
            confidence = "Very High"
        elif final_score >= 0.60:
            verdict = "Likely True – Supported by Sources"
            confidence = "High"
        elif final_score >= 0.45:
            verdict = "Unclear – Conflicting Information"
            confidence = "Moderate"
        elif final_score >= 0.30:
            verdict = "Doubtful – Weak or Biased Evidence"
            confidence = "Low"
        else:
            verdict = "False or Misleading – No Basis Found"
            confidence = "Very Low"

        return final_score, verdict, confidence

    def _print_summary(self, results: Dict):
        """Prints a formatted summary of the analysis results."""
        final_verdict = results["final_verdict"]
        components = results["components"]

        print("COMPREHENSIVE ANALYSIS RESULTS:")
        print("─" * 78)
        print(f"Final Score: {final_verdict['score']:.2f}/1.000")
        print(f"Verdict: {final_verdict['verdict']}")
        print(f"Confidence: {final_verdict['confidence']}")

        print("Component Breakdown:")
        for component, score in final_verdict["components"].items():
            print(f"   • {component.replace('_', ' ').title()}: {score:.2f}")

        print("Summary:")
        print(
            f"   • Trusted Sources: {components['source_credibility']['trusted_count']}"
        )
        print(
            f"   • Suspicious Sources: {components['source_credibility']['suspicious_count']}"
        )
        print(
            f"   • Clickbait Score: {components['clickbait']['score']:.2f} (lower is better)"
        )
        print(f"   • Domain Diversity: {components['network']['domain_diversity']:.2f}")

    def comprehensive_verify(
        self, raw_headline: str, results_to_check: int = 8
    ) -> Dict:
        """
        Comprehensive fact-checking with ML integration.
        This method orchestrates the analysis by calling various specialized components.
        """
        print(f'\nComprehensive Analysis: "{raw_headline}"')
        print("=" * 80)

        if not raw_headline or not raw_headline.strip():
            print("❌ Empty or invalid headline provided")
            return {
                "headline": "",
                "timestamp": datetime.now().isoformat(),
                "final_verdict": {
                    "verdict": "❌ Invalid Input",
                    "confidence": "Very High",
                    "score": 0.0,
                    "components": {
                        "claim_verification": 0.0,
                        "source_credibility": 0.0,
                        "clickbait_detection": 0.0,
                        "network_propagation": 0.0,
                    },
                },
                "components": {
                    "clickbait": {"score": 0.0},
                    "source_credibility": {
                        "score": 0.0,
                        "trusted_count": 0,
                        "suspicious_count": 0,
                    },
                    "network": {"score": 0.0, "domain_diversity": 0.0},
                    "claim_verification": {"score": 0.0},
                },
            }

        # Step 1: Search for sources
        search_results = self._search_for_sources(raw_headline, results_to_check)

        if not search_results:
            print("⚠️ No search results found. Assigning low credibility by default.")
            return {
                "headline": raw_headline,
                "timestamp": datetime.now().isoformat(),
                "final_verdict": {
                    "verdict": "🚫 HIGHLY QUESTIONABLE",
                    "confidence": "Very High",
                    "score": 0.1,
                    "components": {
                        "claim_verification": 0.1,
                        "source_credibility": 0.1,
                        "clickbait_detection": 0.1,
                        "network_propagation": 0.1,
                    },
                },
                "components": {
                    "clickbait": {"score": 0.5},
                    "source_credibility": {
                        "score": 0.1,
                        "trusted_count": 0,
                        "suspicious_count": 0,
                    },
                    "network": {"score": 0.1, "domain_diversity": 0.0},
                    "claim_verification": {"score": 0.1},
                },
            }

        # Step 2: Run all analysis components
        clickbait_score = self._analyze_clickbait(raw_headline)
        avg_source_credibility, trusted_count, suspicious_count = (
            self._analyze_source_credibility(search_results)
        )
        network_analysis = self._analyze_network_propagation(search_results)
        claim_verification_score = self._verify_claim(raw_headline, search_results)

        # Step 3: Consolidate component scores (ensure all are Python floats)
        component_scores = {
            "claim_verification": claim_verification_score,
            "source_credibility": avg_source_credibility,
            "clickbait_detection": 1.0 - clickbait_score,  # Invert score
            "network_propagation": network_analysis["score"],
        }

        # Step 4: Calculate final score and verdict
        final_score, verdict, confidence = self._calculate_final_score_and_verdict(
            component_scores
        )

        # Step 5: Build the final JSON result structure
        analysis_results = {
            "headline": raw_headline,
            "timestamp": datetime.now().isoformat(),
            "final_verdict": {
                "verdict": verdict,
                "confidence": confidence,
                "score": round(final_score, 2),
                "components": {
                    "claim_verification": round(
                        component_scores["claim_verification"], 2
                    ),
                    "source_credibility": round(
                        component_scores["source_credibility"], 2
                    ),
                    "clickbait_detection": round(
                        component_scores["clickbait_detection"], 2
                    ),
                    "network_propagation": round(
                        component_scores["network_propagation"], 2
                    ),
                },
            },
            "components": {
                "clickbait": {"score": round(clickbait_score, 2)},
                "source_credibility": {
                    "score": round(avg_source_credibility, 2),
                    "trusted_count": trusted_count,
                    "suspicious_count": suspicious_count,
                },
                "network": {
                    "score": round(network_analysis["score"], 2),
                    "domain_diversity": round(network_analysis["domain_diversity"], 2),
                },
                "claim_verification": {"score": round(claim_verification_score, 2)},
            },
        }

        # self._print_summary(analysis_results)
        gc.collect()
        return analysis_results
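The verdict in _calculate_final_score_and_verdict is a weighted sum of the four component scores (source credibility 0.35, claim verification 0.35, network propagation 0.20, inverted clickbait 0.10) mapped onto five confidence bands. A standalone sketch of that arithmetic with made-up component values:

# Standalone sketch of the Step 3-4 scoring above; the component values are invented.
weights = {
    "source_credibility": 0.35,
    "claim_verification": 0.35,
    "network_propagation": 0.20,
    "clickbait_detection": 0.10,
}

components = {
    "source_credibility": 0.80,         # mostly trusted domains
    "claim_verification": 0.65,         # sources broadly support the claim
    "network_propagation": 0.55,        # decent spread across outlets
    "clickbait_detection": 1.0 - 0.20,  # the raw clickbait score is inverted first
}

final_score = sum(components[name] * weight for name, weight in weights.items())
print(round(final_score, 2))  # 0.7 -> the 0.60-0.75 band: "Likely True", confidence "High"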
deploy/main/__init__.py
ADDED
File without changes
deploy/main/claim_verifier.py
ADDED
@@ -0,0 +1,371 @@
from typing import List, Dict, Optional, Tuple
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib
from urllib.parse import urlparse
import warnings
import re
from nltk.tokenize import sent_tokenize
import string

from deploy.utils.general_utils import TRUSTED_DOMAINS, SUSPICIOUS_DOMAINS
from deploy.utils.content_extractor import extract_content
from deploy.utils.url_filter import _is_corrupted_pdf_content, _is_pdf_or_download_url
from semantic_similarity import calculate_semantic_similarity

warnings.filterwarnings("ignore")

logging.basicConfig(level=logging.INFO, format="%(message)s")


class ClaimVerifier:
    """Enhanced claim verifier with smart sentence extraction and prioritized scraping."""

    def __init__(self, cache_size: int = 500, max_workers: int = 4):
        self.claim_cache: Dict[str, Dict] = {}
        self.content_cache: Dict[str, str] = {}
        self.cache_size = cache_size
        self.max_workers = max_workers
        self.trusted_domains = TRUSTED_DOMAINS
        self.suspicious_domains = SUSPICIOUS_DOMAINS
        self.domain_weights = {"trusted": 2.0, "suspicious": 0.3, "neutral": 1.0}
        self.user_agents = [
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.196 Mobile Safari/537.36",
            "Mozilla/5.0 (iPad; CPU OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/18.18363",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
        ]
        self.current_ua_index = 0
        self.timeout = 10

    def _get_domain_weight(self, url: str) -> Tuple[float, str]:
        domain = urlparse(url).netloc.lower().replace("www.", "")
        if domain in self.trusted_domains:
            return self.domain_weights["trusted"], "trusted"
        elif domain in self.suspicious_domains:
            return self.domain_weights["suspicious"], "suspicious"
        else:
            return self.domain_weights["neutral"], "neutral"

    def _prioritize_sources(self, search_results: List[str]) -> List[str]:
        """Prioritize trusted sources and filter out PDFs/downloads."""
        # First, filter out PDFs and download links
        filtered_results = []
        pdf_count = 0

        for url in search_results:
            if _is_pdf_or_download_url(url):
                pdf_count += 1
                logging.info(f"Filtered out PDF/download URL: {url}")
                continue
            filtered_results.append(url)

        if pdf_count > 0:
            logging.info(f"🚫 Filtered out {pdf_count} PDF/download URLs")

        if not filtered_results:
            logging.warning("⚠️ No valid URLs remaining after filtering PDFs/downloads")
            return []

        # Then prioritize trusted sources
        trusted_sources = [
            url
            for url in filtered_results
            if self._get_domain_weight(url)[1] == "trusted"
        ]
        other_sources = [
            url
            for url in filtered_results
            if self._get_domain_weight(url)[1] != "trusted"
        ]

        if len(trusted_sources) >= 4:
            return trusted_sources[:8]
        else:
            return (trusted_sources + other_sources)[:8]

    def _is_valid_sentence(self, sentence: str) -> bool:
        """Enhanced sentence validation to filter out garbled/corrupted text."""
        sentence = sentence.strip()

        # Basic length check
        if len(sentence) < 20 or len(sentence) > 300:
            return False

        # Check for too many non-ASCII characters (garbled text indicator)
        non_ascii_count = sum(1 for c in sentence if ord(c) > 127)
        if non_ascii_count > len(sentence) * 0.3:  # More than 30% non-ASCII
            return False

        # Check for excessive special characters or symbols
        special_chars = sum(
            1 for c in sentence if c in string.punctuation and c not in ".,!?;:"
        )
        if special_chars > len(sentence) * 0.2:  # More than 20% special chars
            return False

        # Enhanced check for random character patterns (PDF corruption indicators)
        if re.search(r"[^\w\s]{3,}", sentence):  # 3+ consecutive non-word chars
            return False

        # Check for PDF-specific corruption patterns
        if re.search(r"(endstream|endobj|obj\s*<|stream\s+H)", sentence, re.IGNORECASE):
            return False

        # Check for excessive whitespace or control characters
        if re.search(r"\s{3,}", sentence) or any(
            ord(c) < 32 and c not in "\t\n\r" for c in sentence
        ):
            return False

        # Check for minimum word count and average word length
        words = sentence.split()
        if len(words) < 4:
            return False

        # Check for reasonable word lengths (avoid strings like "a b c d e f g")
        avg_word_length = sum(len(word) for word in words) / len(words)
        if avg_word_length < 2.5:
            return False

        # Check for excessive capitalization
        if sum(1 for c in sentence if c.isupper()) > len(sentence) * 0.5:
            return False

        # Check for sequences that look like corrupted encoding
        if re.search(r"[^\w\s]{5,}", sentence):
            return False

        return True

    def _is_noise_sentence(self, sentence: str) -> bool:
        """Check if a sentence is likely noise (navigation, ads, etc.)."""
        noise_patterns = [
            r"^(click|tap|read|view|see|watch|follow|subscribe)",
            r"(cookie|privacy|terms|conditions|policy)",
            r"(advertisement|sponsored|ad)",
            r"(©|copyright|\u00a9)",
            r"^(home|about|contact|menu|search)",
            r"(javascript|enable|browser|update)",
            r"^[\W\d\s]*$",
            r"(share|like|comment|subscribe)",
            r"(login|sign\s+in|register)",
            r"(loading|please\s+wait)",
            # Add PDF-specific noise patterns
            r"(pdf|download|file|document)\s*(viewer|reader)",
            r"(page|pages)\s*\d+\s*(of|\/)\s*\d+",
            r"(adobe|acrobat|reader)",
        ]
        sentence_lower = sentence.lower()
        return any(re.search(pattern, sentence_lower) for pattern in noise_patterns)

    def _extract_relevant_sentences(self, content: str) -> List[str]:
        """Extract relevant, well-formed sentences from the page content."""
        if not content or len(content.strip()) < 50:
            return []

        # Check if content appears to be corrupted PDF
        if _is_corrupted_pdf_content(content):
            logging.warning("🚫 Content appears to be corrupted PDF - skipping")
            return []

        sentences = sent_tokenize(content)

        # Enhanced filtering pipeline
        valid_sentences = []
        for sentence in sentences:
            if self._is_valid_sentence(sentence) and not self._is_noise_sentence(
                sentence
            ):
                valid_sentences.append(sentence.strip())

        if not valid_sentences:
            logging.warning("No valid sentences found after filtering")
            return []

        return valid_sentences

    def _get_user_agent(self) -> str:
        ua = self.user_agents[self.current_ua_index]
        self.current_ua_index = (self.current_ua_index + 1) % len(self.user_agents)
        return ua

    def _cache_key(self, text: str) -> str:
        return hashlib.md5(text.encode()).hexdigest()

    def _add_to_cache(self, key: str, result: Dict):
        if len(self.claim_cache) >= self.cache_size:
            oldest_key = next(iter(self.claim_cache))
            del self.claim_cache[oldest_key]
        self.claim_cache[key] = result

    def _get_from_cache(self, key: str) -> Optional[Dict]:
        return self.claim_cache.get(key)

    def _semantic_similarity_with_sentences(
        self, claim: str, sentences: List[str]
    ) -> float:
        """Calculate entailment scores and return the best one."""
        try:
            score = calculate_semantic_similarity(claim, sentences)
        except Exception as e:
            logging.error(f"Error analyzing sentence: {e}")
            score = 0.0  # fall back to zero support if scoring fails
        return score

    def verify_claim_against_sources(
        self, claim: str, search_results: List[str]
    ) -> Dict:
        logging.info(f"\nVerifying Claim: '{claim}'...")

        cache_key = self._cache_key(f"verify_{claim}")
        if cached_result := self._get_from_cache(cache_key):
            logging.info("Using cached result")
            return cached_result

        prioritized_sources = self._prioritize_sources(search_results)

        if not prioritized_sources:
            logging.warning("⚠️ No valid sources available after filtering")
            return {
                "score": 0.3,
                "total_sources_processed": 0,
                "support_sum": 0.0,
                "total_weight": 0.0,
                "source_details": [],
                "warning": "No valid sources available after filtering PDFs/downloads",
            }

        support_scores = []
        total_weight = 0.0
        source_details = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_url = {
                executor.submit(self._analyze_url, url, claim): url
                for url in prioritized_sources
            }

            try:
                for future in as_completed(future_to_url, timeout=45):
                    url = future_to_url[future]
                    try:
                        if result := future.result(timeout=15):
                            similarity_score, domain_weight, domain_type, sentences = (
                                result
                            )

                            # Enhanced Logging Format
                            logging.info(f"\nSource: {url} ({domain_type})")
                            # logging.info(
                            #     f"   - Relevant Sentences: {sentences[:3]}"
                            # )  # Log first 2 sentences
                            logging.info(
                                f"   - Entailment Score: {similarity_score:.2f}"
                            )

                            total_weight += domain_weight
                            if similarity_score >= 0.4:
                                support_scores.append(similarity_score * domain_weight)

                            source_details.append(
                                {
                                    "url": url,
                                    "semantic_similarity": similarity_score,
                                    "domain_weight": domain_weight,
                                    "domain_type": domain_type,
                                    "relevant_sentences": sentences[:3],
                                }
                            )
                    except Exception as e:
                        logging.error(f"Error processing {url}: {e}")
            except TimeoutError:
                logging.warning("⏰ Timeout: Some URLs were skipped.")

        support_sum = sum(support_scores)

        if total_weight > 0 and support_scores:  # guard against an empty support list
            final_score = min(1.0, support_sum / len(support_scores))
            # Adjustments
            # if final_score < 0.5 and support_sum < 0.5:
            #     final_score *= 0.8
            # elif final_score > 0.5 and support_sum >= 1.0:
            #     final_score = min(0.9, final_score * 1.1)
        else:
            final_score = 0.1

        final_score = max(0.0, min(1.0, final_score))
        logging.info(
            f"\n{'='*20}\nFinal Verification Score: {final_score:.2f}\n{'='*20}"
        )

        result = {
            "score": final_score,
            "total_sources_processed": len(source_details),
            "support_sum": support_sum,
            "total_weight": total_weight,
            "source_details": source_details,
        }
        self._add_to_cache(cache_key, result)
        return result

    def _analyze_url(
        self, url: str, claim: str
    ) -> Optional[Tuple[float, float, str, List[str]]]:
        try:
            # Double-check for PDFs at analysis time (in case some slipped through)
            if _is_pdf_or_download_url(url):
                logging.info(f"🚫 Skipping PDF/download URL at analysis time: {url}")
                return None

            cache_key = self._cache_key(url)
            content = extract_content(
                url,
                self.content_cache,
                cache_key,
                self._get_user_agent,
                self.timeout,
                self.cache_size,
            )

            if not content or len(content.strip()) < 50:
                return None

            # Check for corrupted PDF content
            if _is_corrupted_pdf_content(content):
                logging.warning(f"🚫 Skipping corrupted PDF content from: {url}")
                return None

            # Used for sentence extraction instead of embeddings
            relevant_sentences = self._extract_relevant_sentences(content)

            if not relevant_sentences:
                return None

            # cleaned_content = ""
            # for sentence in relevant_sentences:
            #     if (
            #         sentence.endswith(".")
            #         or sentence.endswith("?")
            #         or sentence.endswith("!")
            #     ):
            #         cleaned_content += f"{sentence} "
            #     else:
            #         cleaned_content += f"{sentence}. "

            semantic_similarity = self._semantic_similarity_with_sentences(
                claim, relevant_sentences
            )

            domain_weight, domain_type = self._get_domain_weight(url)
            # print(f"relevant_sentences: {cleaned_content}")

            return semantic_similarity, domain_weight, domain_type, relevant_sentences
        except Exception as e:
            logging.error(f"Failed to analyze URL {url}: {e}")
            return None
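In verify_claim_against_sources, each source's semantic similarity is multiplied by its domain weight (trusted 2.0, neutral 1.0, suspicious 0.3), only similarities of at least 0.4 count as support, and the final score is the mean of those weighted supports capped at 1.0. A small illustration of that weighting with invented similarity values:

# Illustration only: how ClaimVerifier.domain_weights shapes the verification score.
# The domains and similarity values below are invented for the example.
domain_weights = {"trusted": 2.0, "suspicious": 0.3, "neutral": 1.0}

observations = [
    ("reuters.com", "trusted", 0.62),
    ("example-blog.net", "neutral", 0.45),
    ("randomsite.xyz", "neutral", 0.25),  # below the 0.4 support threshold, ignored
]

support_scores = [
    sim * domain_weights[kind] for _, kind, sim in observations if sim >= 0.4
]
final_score = min(1.0, sum(support_scores) / len(support_scores)) if support_scores else 0.1
print(final_score)  # ~0.85: the trusted source counts double, lifting the average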
deploy/main/network_analyzer.py
ADDED
@@ -0,0 +1,259 @@
import math
from collections import Counter
from typing import List, Dict

from deploy.utils.general_utils import (
    TRUSTED_DOMAINS,
    SUSPICIOUS_DOMAINS,
    extract_domain,
)

SOCIAL_AGGREGATOR_DOMAINS = {
    "facebook.com",
    "twitter.com",
    "reddit.com",
    "youtube.com",
    "instagram.com",
    "tiktok.com",
    "google.com",
    "yahoo.com",
    "msn.com",
    "aol.com",
    "linkedin.com",
    "pinterest.com",
    "snapchat.com",
    "discord.com",
    "telegram.org",
}

CONTENT_FARM_DOMAINS = {
    "buzzfeed.com",
    "clickhole.com",
    "upworthy.com",
    "viralthread.com",
    "shareably.net",
    "littlethings.com",
    "providr.com",
    "shared.com",
}


class NetworkAnalyzer:
    """Propagation pattern analyzer - returns only score, and domain_diversity"""

    def __init__(self):
        # Scoring weights
        self.weights = {"domain_credibility": 0.60, "diversity_quality": 0.40}
        self.min_sources_threshold = 3
        self.min_unique_domains = 2

    def _calculate_domain_credibility_score(self, domains: List[str]) -> float:
        """Calculate domain credibility score"""
        if not domains:
            return 0.0

        domain_counts = Counter(domains)
        total_sources = len(domains)

        # Categorize domains
        trusted_count = sum(
            count
            for domain, count in domain_counts.items()
            if domain in TRUSTED_DOMAINS
        )
        suspicious_count = sum(
            count
            for domain, count in domain_counts.items()
            if domain in SUSPICIOUS_DOMAINS
        )
        social_count = sum(
            count
            for domain, count in domain_counts.items()
            if domain in SOCIAL_AGGREGATOR_DOMAINS
        )
        content_farm_count = sum(
            count
            for domain, count in domain_counts.items()
            if domain in CONTENT_FARM_DOMAINS
        )

        # Calculate ratios
        trusted_ratio = trusted_count / total_sources
        suspicious_ratio = suspicious_count / total_sources
        social_ratio = social_count / total_sources
        content_farm_ratio = content_farm_count / total_sources
        unknown_ratio = 1 - (
            trusted_ratio + suspicious_ratio + social_ratio + content_farm_ratio
        )

        # Calculate score
        base_score = 0.15
        score = base_score
        score += trusted_ratio * 0.6
        score -= suspicious_ratio * 0.8
        score -= content_farm_ratio * 0.4
        score += social_ratio * 0.1
        score -= unknown_ratio * 0.2

        # Additional penalties
        if suspicious_ratio > 0.5:
            score -= 0.3
        if trusted_count == 0 and total_sources > 5:
            score -= 0.2
        if content_farm_ratio > 0.4:
            score -= 0.15

        return max(0.0, min(1.0, score))

    def _calculate_diversity_quality(self, domains: List[str]) -> Dict:
        """Calculate diversity quality - returns score and entropy.

        Entropy here is a statistical measure of domain diversity,
        helping to assess whether a claim's spread is broad and
        organic or narrow and potentially suspicious.
        """
        if len(domains) < 2:
            return {"score": 0.0, "entropy": 0.0}

        domain_counts = Counter(domains)
        unique_domains = len(set(domains))
        total_sources = len(domains)

        # Calculate Shannon entropy
        entropy = 0.0
        for count in domain_counts.values():
            p = count / total_sources
            if p > 0:
                entropy -= p * math.log2(p)

        # Normalize entropy
        max_entropy = math.log2(unique_domains) if unique_domains > 1 else 0
        normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0

        # Base diversity score
        diversity_score = normalized_entropy

        # Detect artificial patterns
        max_domain_share = max(domain_counts.values()) / total_sources

        # Single domain dominance penalty
        if max_domain_share > 0.7 and unique_domains > 3:
            diversity_score -= 0.4

        # Artificial diversity penalty
        single_mention_domains = sum(
            1 for count in domain_counts.values() if count == 1
        )
        if single_mention_domains > total_sources * 0.8 and total_sources > 10:
            diversity_score -= 0.3

        if 0.3 <= normalized_entropy <= 0.8 and unique_domains >= 3:
            diversity_score += 0.2

        return {
            "score": max(0.0, min(1.0, diversity_score)),
            "entropy": normalized_entropy,
        }

    def analyze_propagation_pattern(self, search_results: List[str]) -> Dict:
        """Analyze propagation pattern - returns score, and domain_diversity"""
        domains = []
        valid_urls = 0

        for url in search_results:
            domain = extract_domain(url)
            if domain and domain not in ["", "localhost"]:
                domains.append(domain)
                valid_urls += 1

        # Early return for insufficient data
        if len(domains) < self.min_sources_threshold:
            return {"score": 0.1, "domain_diversity": 0.0}

        # Perform analysis
        credibility_score = self._calculate_domain_credibility_score(domains)
        diversity_analysis = self._calculate_diversity_quality(domains)

        # Calculate weighted final score
        final_score = (
            credibility_score * self.weights["domain_credibility"]
            + diversity_analysis["score"] * self.weights["diversity_quality"]
        )

        # Additional quality adjustments
        unique_domains = len(set(domains))
        trusted_count = sum(1 for d in domains if d in TRUSTED_DOMAINS)
        suspicious_count = sum(1 for d in domains if d in SUSPICIOUS_DOMAINS)

        if trusted_count >= 3 and suspicious_count == 0:
            final_score += 0.1
        elif suspicious_count > trusted_count:
            final_score -= 0.15

        if unique_domains < self.min_unique_domains:
            final_score = min(final_score, 0.3)

        final_score = max(0.0, min(1.0, final_score))

        return {
            "score": round(final_score, 3),
            "domain_diversity": round(diversity_analysis["entropy"], 3),
        }


if __name__ == "__main__":
    analyzer = NetworkAnalyzer()

    # Test Case 1: Mixed credible and suspicious domains
    search_results_1 = [
        "https://reuters.com/news/article1",
        "https://bbc.com/news/article2",
        "https://ghanaweb.com/article3",
        "https://cnn.com/article4",
        "https://naturalnews.com/fake1",
        "https://infowars.com/fake2",
    ]
    print("\nTest Case 1: Mixed credible and suspicious")
    result1 = analyzer.analyze_propagation_pattern(search_results_1)
    print(f"Result: {result1}")

    # Test Case 2: Mostly trusted domains
    search_results_2 = [
        "https://bbc.com/article",
        "https://cnn.com/article",
        "https://reuters.com/article",
        "https://nytimes.com/article",
        "https://ghanaweb.com/article",
    ]
    print("\nTest Case 2: Mostly trusted domains")
    result2 = analyzer.analyze_propagation_pattern(search_results_2)
    print(f"Result: {result2}")

    # Test Case 3: Mostly suspicious and content farms
    search_results_3 = [
        "https://infowars.com/fake",
        "https://naturalnews.com/fake",
        "https://clickhole.com/funny",
        "https://upworthy.com/clickbait",
        "https://shared.com/share",
    ]
    print("\nTest Case 3: Suspicious and content farm heavy")
    result3 = analyzer.analyze_propagation_pattern(search_results_3)
    print(f"Result: {result3}")

    # Test Case 4: Low diversity (same domain repeated)
    search_results_4 = [
        "https://buzzfeed.com/post1",
        "https://buzzfeed.com/post2",
        "https://buzzfeed.com/post3",
        "https://buzzfeed.com/post4",
        "https://buzzfeed.com/post5",
    ]
    print("\nTest Case 4: Low domain diversity")
    result4 = analyzer.analyze_propagation_pattern(search_results_4)
    print(f"Result: {result4}")

    # Test Case 5: Not enough sources
    search_results_5 = ["https://cnn.com/article1"]
    print("\nTest Case 5: Insufficient results")
    result5 = analyzer.analyze_propagation_pattern(search_results_5)
    print(f"Result: {result5}")
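The domain_diversity value returned above is the Shannon entropy of the domain distribution, normalized by log2 of the number of unique domains: values near 1.0 mean the results are spread evenly across outlets, values near 0 mean one outlet dominates. A compact sketch of just that computation, with an invented domain list:

# Sketch of the normalized-entropy calculation from _calculate_diversity_quality;
# the domain list is invented for illustration.
import math
from collections import Counter

domains = ["buzzfeed.com", "buzzfeed.com", "buzzfeed.com", "bbc.com", "reuters.com"]

counts = Counter(domains)
total = len(domains)

# Shannon entropy of the domain distribution
entropy = -sum((c / total) * math.log2(c / total) for c in counts.values())

# Normalize by the maximum possible entropy for this many unique domains
max_entropy = math.log2(len(counts)) if len(counts) > 1 else 0
domain_diversity = entropy / max_entropy if max_entropy > 0 else 0
print(round(domain_diversity, 3))  # ~0.865: skewed toward one outlet, but not single-source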
deploy/main/predict_clickbait.py
ADDED
@@ -0,0 +1,43 @@
import pickle
import numpy as np
from deploy.utils.clickbait_utils import extract_enhanced_features


class ClickbaitPredictor:
    def __init__(self, model_dir="./models/clickbait"):
        try:
            with open(f"{model_dir}/logistic_regression_model.pkl", "rb") as f:
                self.classifier = pickle.load(f)
            with open(f"{model_dir}/tfidf_vectorizer.pkl", "rb") as f:
                self.tfidf_vectorizer = pickle.load(f)
            with open(f"{model_dir}/feature_info.pkl", "rb") as f:
                self.clickbait_indicators = pickle.load(f)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            self.classifier = None
            self.tfidf_vectorizer = None
            self.clickbait_indicators = None

    def predict(self, headline, threshold=0.5):
        if self.classifier is None or self.tfidf_vectorizer is None:
            raise RuntimeError("Model or vectorizer not loaded.")
        tfidf_features = self.tfidf_vectorizer.transform([headline])
        handcrafted_features = extract_enhanced_features([headline])
        combined_features = np.hstack((tfidf_features.toarray(), handcrafted_features))
        lr_probs = self.classifier.predict_proba(combined_features)[0]
        lr_score = lr_probs[1]
        is_clickbait = lr_score >= threshold
        confidence = lr_score if is_clickbait else (1 - lr_score)
        return is_clickbait, lr_score, confidence


if __name__ == "__main__":
    predictor = ClickbaitPredictor()
    while True:
        headline = input("Enter a headline to check clickbait score: ")
        is_clickbait, score, confidence = predictor.predict(headline)
        status = "CLICKBAIT" if is_clickbait else "NORMAL"
        print(f"{status} (Score: {score:.3f}, Confidence: {confidence:.3f})")
        print(f"   '{headline}'")
        print()
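ClickbaitPredictor.predict concatenates the headline's TF-IDF vector with the handcrafted feature vector before calling the logistic regression, so the column order must match what the pickled model was trained on. A toy sketch of the shape handling with dummy arrays (the dimensions are invented):

# Toy sketch of the feature concatenation in predict(); the dimensions are invented
# stand-ins for tfidf_vectorizer.transform(...).toarray() and extract_enhanced_features(...).
import numpy as np

tfidf_part = np.zeros((1, 5000))      # one row per headline, one column per TF-IDF term
handcrafted_part = np.ones((1, 12))   # one row per headline, one column per handcrafted feature

combined = np.hstack((tfidf_part, handcrafted_part))
print(combined.shape)  # (1, 5012) - must match the feature layout used at training time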
deploy/main/source_credibility_analyzer.py
ADDED
@@ -0,0 +1,192 @@
import re
from urllib.parse import urlparse

from deploy.utils.general_utils import TRUSTED_DOMAINS, SUSPICIOUS_DOMAINS


class SourceCredibilityAnalyzer:
    """Simplified source credibility analyzer - returns only the score"""

    def __init__(self):
        # Weighted scoring system
        self.weights = {
            "tld_credibility": 0.4,
            "domain_structure": 0.3,
            "news_indicators": 0.2,
            "domain_age_indicators": 0.15,
            "subdomain_analysis": 0.1,
        }

        # Suspicious patterns
        self._suspicious_patterns = [
            (re.compile(r"\d{4,}"), 0.8),
            (
                re.compile(r"(fake|hoax|scam|click|bait|spam|phishing)", re.IGNORECASE),
                0.9,
            ),
            (re.compile(r"[a-z]+\d+[a-z]+\d+", re.IGNORECASE), 0.7),
            (re.compile(r"(xxx|porn|adult|sex)", re.IGNORECASE), 0.6),
            (re.compile(r"(free|download|crack|hack)", re.IGNORECASE), 0.5),
            (re.compile(r"[0-9]{1,3}-[0-9]{1,3}-[0-9]{1,3}", re.IGNORECASE), 0.8),
            (re.compile(r"(temp|tmp|test|demo)", re.IGNORECASE), 0.4),
        ]

        # TLD scores
        self.tld_scores = {
            ".edu": 0.9,
            ".gov": 0.95,
            ".mil": 0.9,
            ".org": 0.7,
            ".ac.uk": 0.8,
            ".edu.au": 0.8,
            ".com": 0.3,
            ".net": 0.25,
            ".co.uk": 0.4,
            ".com.au": 0.4,
            ".de": 0.4,
            ".fr": 0.4,
            ".ca": 0.4,
            ".jp": 0.4,
            ".info": 0.1,
            ".biz": 0.1,
            ".name": 0.05,
            ".tk": -0.6,
            ".ml": -0.6,
            ".ga": -0.6,
            ".cf": -0.6,
            ".pw": -0.4,
            ".top": -0.3,
            ".click": -0.5,
            ".download": -0.4,
            ".stream": -0.3,
            ".review": -0.2,
            ".date": -0.3,
            ".racing": -0.4,
        }

        # News indicators
        self.news_indicators = {
            "news": 0.3,
            "times": 0.3,
            "post": 0.25,
            "herald": 0.2,
            "gazette": 0.2,
            "journal": 0.2,
            "tribune": 0.2,
            "chronicle": 0.2,
            "report": 0.15,
            "press": 0.2,
            "media": 0.1,
            "broadcast": 0.15,
            "reuters": 0.4,
            "associated": 0.3,
            "wire": 0.2,
        }

    def analyze_domain_credibility(self, domain: str) -> float:
        """Get credibility score for domain"""
        domain = domain.lower().strip()

        # Handle URLs by extracting domain
        if domain.startswith(("http://", "https://")):
            parsed = urlparse(domain)
            domain = parsed.netloc.lower()

        # Remove www prefix
        if domain.startswith("www."):
            domain = domain[4:]

        # Check trusted domains
        if domain in TRUSTED_DOMAINS:
            return 0.95

        # Check suspicious domains
        if domain in SUSPICIOUS_DOMAINS:
            return 0.05

        # Calculate score components
        tld_score = self._get_tld_score(domain)
        structure_score = self._get_structure_score(domain)
        news_score = self._get_news_score(domain)
        establishment_score = self._get_establishment_score(domain)
        subdomain_score = self._get_subdomain_score(domain)

        # Start with base score and apply weighted components
        base_score = 0.2
        final_score = base_score
        final_score += tld_score * self.weights["tld_credibility"]
        final_score += structure_score * self.weights["domain_structure"]
        final_score += news_score * self.weights["news_indicators"]
        final_score += establishment_score * self.weights["domain_age_indicators"]
        final_score += subdomain_score * self.weights["subdomain_analysis"]

        return max(0.0, min(1.0, round(final_score, 2)))

    def _get_tld_score(self, domain: str) -> float:
        """Get TLD score"""
        for tld, score in self.tld_scores.items():
            if domain.endswith(tld):
                return score
        return -0.1  # Unknown TLD

    def _get_structure_score(self, domain: str) -> float:
        """Get domain structure score"""
        suspicious_score = 0

        for pattern, severity in self._suspicious_patterns:
            if pattern.search(domain):
                suspicious_score -= severity * 0.3

        if len(domain.split(".")[0]) < 3:
            suspicious_score -= 0.2

        if domain.count("-") > 2:
            suspicious_score -= 0.15

        return max(-0.8, suspicious_score)

    def _get_news_score(self, domain: str) -> float:
        """Get news indicators score"""
        score = 0
        for indicator, weight in self.news_indicators.items():
            if indicator in domain:
                score += weight
        return min(0.4, score)

    def _get_establishment_score(self, domain: str) -> float:
        """Get establishment indicators score"""
        score = 0

        if any(
            word in domain
            for word in ["university", "college", "institute", "foundation"]
        ):
            score += 0.3

        if any(word in domain for word in ["library", "museum", "archive"]):
            score += 0.2

        if any(word in domain for word in ["research", "study", "science"]):
            score += 0.15

        return min(0.3, score)

    def _get_subdomain_score(self, domain: str) -> float:
        """Get subdomain score"""
        parts = domain.split(".")

        if len(parts) <= 2:
            return 0.1
        elif len(parts) > 4:
            return -0.15
        else:
            return 0


if __name__ == "__main__":
    analyzer = SourceCredibilityAnalyzer()
    # domains_to_analyze = ["ghanaweb.com"]
    domain = input("Enter a domain to check credibility: ")
    # for domain in domains_to_analyze:
    result = analyzer.analyze_domain_credibility(domain)
    print(f"{domain} -> {result:.2f}")
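A minimal usage sketch for the analyzer above, assuming the deploy package is importable from the Space's working directory; the example domains are illustrative only:

from deploy.main.source_credibility_analyzer import SourceCredibilityAnalyzer

analyzer = SourceCredibilityAnalyzer()

# Known trusted domains short-circuit to 0.95 and known suspicious ones to 0.05;
# anything else falls through to the weighted TLD/structure/news scoring.
for domain in ["bbc.com", "breaking-news1234.tk", "https://www.ghanaweb.com/article"]:
    print(domain, analyzer.analyze_domain_credibility(domain))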
deploy/utils/__init__.py
ADDED
File without changes
deploy/utils/clickbait_utils.py
ADDED
@@ -0,0 +1,145 @@
import re
import numpy as np

clickbait_indicators = {
    "curiosity_gap": [
        "you won't believe",
        "wait until you see",
        "what happened next",
        "the reason will shock you",
        "this is why",
        "here's what happened",
        "the truth about",
        "what nobody tells you",
        "finally revealed",
    ],
    "emotional_triggers": [
        "shocking",
        "incredible",
        "amazing",
        "unbelievable",
        "stunning",
        "heartbreaking",
        "hilarious",
        "terrifying",
        "adorable",
        "outrageous",
        "mind-blowing",
        "jaw-dropping",
        "breathtaking",
    ],
    "urgency_scarcity": [
        "breaking",
        "urgent",
        "limited time",
        "before it's too late",
        "act now",
        "don't miss",
        "last chance",
        "expires soon",
    ],
    "personal_relevance": [
        "in your area",
        "people like you",
        "your age",
        "based on your",
        "you need to know",
        "this affects you",
        "for people who",
    ],
    "superlatives": [
        "ultimate",
        "perfect",
        "best ever",
        "greatest",
        "worst",
        "most amazing",
        "incredible",
        "unmatched",
        "revolutionary",
    ],
    "numbers_lists": [
        r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)",
        r"one\s+(weird|simple|amazing)\s+trick",
        r"\d+\s+minute[s]?",
        r"in\s+\d+\s+(steps?|minutes?|days?)",
    ],
    "authority_social_proof": [
        "doctors hate",
        "experts don't want",
        "celebrities use",
        "scientists discovered",
        "research shows",
        "studies prove",
    ],
}


def extract_enhanced_features(texts):
    """Extract comprehensive handcrafted features"""
    features = []

    for text in texts:
        if not isinstance(text, str):
            text = str(text) if text is not None else ""

        text_lower = text.lower()
        feature_vector = []

        # Clickbait pattern scores by category
        for category, patterns in clickbait_indicators.items():
            category_score = 0
            for pattern in patterns:
                if category == "numbers_lists":
                    # These entries are regular expressions, not literal substrings
                    if re.search(pattern, text_lower):
                        category_score += 1
                elif pattern in text_lower:
                    category_score += 1

            # Normalize by pattern count in category
            normalized_score = min(category_score / len(patterns), 1.0)
            feature_vector.append(normalized_score)

        # Punctuation and formatting features
        exclamation_ratio = text.count("!") / max(len(text), 1)
        question_ratio = text.count("?") / max(len(text), 1)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)

        feature_vector.extend(
            [
                min(exclamation_ratio * 10, 1.0),
                min(question_ratio * 10, 1.0),
                min(caps_ratio * 5, 1.0),
            ]
        )

        # Length and structure features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)

        feature_vector.extend(
            [
                min(word_count / 20, 1.0),  # Normalized word count
                min(avg_word_length / 8, 1.0),  # Normalized avg word length
                1.0 if word_count > 10 else 0.0,  # Long headline indicator
            ]
        )

        # Semantic features
        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
        number_count = len(
            [word for word in words if any(char.isdigit() for char in word)]
        )

        feature_vector.extend(
            [
                min(all_caps_words / max(word_count, 1), 1.0),
                min(number_count / max(word_count, 1), 1.0),
            ]
        )

        features.append(feature_vector)

    return np.array(features)
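A quick, hypothetical check of the feature extractor on two invented headlines; each returned row holds the seven per-category scores followed by eight punctuation, length, and casing features (15 values in total):

from deploy.utils.clickbait_utils import extract_enhanced_features

headlines = [
    "You Won't Believe What This Dog Did Next!",
    "Government publishes quarterly inflation report",
]
features = extract_enhanced_features(headlines)
print(features.shape)  # (2, 15)
print(features[0])     # the clickbait-style headline scores higher in several categories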
deploy/utils/content_extractor.py
ADDED
@@ -0,0 +1,140 @@
import re
import time
import requests
from bs4 import BeautifulSoup
from newspaper import Article


def extract_content(
    url: str,
    content_cache: dict,
    cache_key: str,
    get_user_agent,
    timeout: int,
    cache_size: int,
) -> str:
    """Enhanced content extraction with newspaper3k fallback to BeautifulSoup."""
    if cache_key in content_cache:
        return content_cache[cache_key]

    try:
        # Try newspaper3k first
        article = Article(url)
        article.download()
        article.parse()

        content = article.text

        # If newspaper3k didn't get good content, fall back to BeautifulSoup
        if not content or len(content.strip()) < 100:
            content = _fallback_extraction(url, get_user_agent, timeout)

        # Clean and normalize content
        content = _clean_content(content)
        content = content[:10000]  # Increased from 8000

        # Cache result
        if len(content_cache) >= cache_size:
            oldest_key = next(iter(content_cache))
            del content_cache[oldest_key]

        content_cache[cache_key] = content
        return content

    except Exception:
        # If newspaper3k fails, try BeautifulSoup fallback
        try:
            content = _fallback_extraction(url, get_user_agent, timeout)
            content = _clean_content(content)
            content = content[:10000]

            if len(content_cache) >= cache_size:
                oldest_key = next(iter(content_cache))
                del content_cache[oldest_key]

            content_cache[cache_key] = content
            return content
        except Exception:
            return ""


def _fallback_extraction(url: str, get_user_agent, timeout: int) -> str:
    """Fallback extraction using BeautifulSoup."""
    headers = {
        "User-Agent": get_user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    time.sleep(0.5)

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Handle encoding
    if response.encoding is None or response.encoding.lower() in ["iso-8859-1", "ascii"]:
        response.encoding = "utf-8"

    try:
        html_content = response.text
    except UnicodeDecodeError:
        try:
            html_content = response.content.decode("utf-8", errors="ignore")
        except UnicodeDecodeError:
            html_content = response.content.decode("latin-1", errors="replace")

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove irrelevant content
    for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "iframe"]):
        element.decompose()

    # Extract content using selectors
    content_selectors = [
        "article",
        "main",
        '[role="main"]',
        ".content",
        ".article-content",
        ".post-content",
        ".entry-content",
        ".article-body",
    ]

    extracted_text = ""
    for selector in content_selectors:
        elements = soup.select(selector)
        if elements:
            extracted_text = " ".join(
                [elem.get_text(separator=" ", strip=True) for elem in elements]
            )
            break

    if not extracted_text:
        content_elements = soup.find_all(
            ["p", "div"], class_=lambda x: x is None or "ad" not in str(x).lower()
        )
        extracted_text = " ".join(
            [elem.get_text(separator=" ", strip=True) for elem in content_elements]
        )

    if not extracted_text:
        extracted_text = soup.get_text(separator=" ", strip=True)

    return extracted_text


def _clean_content(content: str) -> str:
    """Clean and normalize extracted content."""
    # Clean problematic characters
    content = content.replace("\ufffd", " ")
    content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x84\x86-\x9f]", " ", content)

    # Normalize unicode if available
    try:
        import unicodedata

        content = unicodedata.normalize("NFKD", content)
    except Exception:
        pass

    # Normalize whitespace and clean
    content = re.sub(r"\s+", " ", content).strip()
    content = re.sub(r"[^\x20-\x7E\u00A0-\uFFFF]", " ", content)

    return content
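An illustrative call, assuming the caller owns the cache dict and supplies a user-agent callable (both are injected rather than created by this module); the URL is a placeholder:

from deploy.utils.content_extractor import extract_content

cache = {}  # simple in-memory cache shared across calls
url = "https://example.com/some-article"

text = extract_content(
    url,
    content_cache=cache,
    cache_key=url,  # the URL itself serves as the cache key here
    get_user_agent=lambda: "Mozilla/5.0 (compatible; NewsVerifier/1.0)",
    timeout=10,
    cache_size=100,
)
print(text[:300])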
deploy/utils/general_utils.py
ADDED
@@ -0,0 +1,197 @@
import tldextract
import re
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# Trusted domains
TRUSTED_DOMAINS = {
    # International Mainstream News
    "abcnews.go.com",
    "aljazeera.com",
    "apnews.com",
    "bbc.com",
    "bloomberg.com",
    "cbc.ca",
    "cbsnews.com",
    "cnn.com",
    "dw.com",
    "economist.com",
    "euronews.com",
    "forbes.com",
    "ft.com",
    "indiatimes.com",
    "japantimes.co.jp",
    "latimes.com",
    "npr.org",
    "nytimes.com",
    "reuters.com",
    "smh.com.au",
    "theguardian.com",
    "usatoday.com",
    "washingtonpost.com",
    "wsj.com",
    "france24.com",
    # Ghana-Specific News
    "3news.com",
    "adomonline.com",
    "citinewsroom.com",
    "ghanaweb.com",
    "ghanaiantimes.com.gh",
    "ghananewsagency.org",
    "graphic.com.gh",
    "modernghana.com",
    "myjoyonline.com",
    "peacefmonline.com",
    "pulse.com.gh",
    "starrfm.com.gh",
    "thebftonline.com",
    "yen.com.gh",
    "nsmq.com.gh",
    # Sports News
    "cbssports.com",
    "espn.com",
    "eurosport.com",
    "fifa.com",
    "footballghana.com",
    "foxsports.com",
    "ghanasoccernet.com",
    "goal.com",
    "nba.com",
    "nbcsports.com",
    "onefootball.com",
    "skysports.com",
    "sportinglife.com",
    "supersport.com",
    "tntsports.co.uk",
    "theathletic.com",
    "olympics.com",
    # Entertainment & Pop Culture
    "billboard.com",
    "deadline.com",
    "entertainment.com",
    "eonline.com",
    "ew.com",
    "hollywoodreporter.com",
    "indiewire.com",
    "people.com",
    "rollingstone.com",
    "thewrap.com",
    "variety.com",
    # Science & Research
    "eurekalert.org",
    "medpagetoday.com",
    "nasa.gov",
    "nature.com",
    "sciencealert.com",
    "sciencenews.org",
    "statnews.com",
    # Fact-Checking & Watchdogs
    "africacheck.org",
    "factcheck.org",
    "fullfact.org",
    "politifact.com",
    "snopes.com",
    # Global & General Niche News
    "asia.nikkei.com",
    "globalissues.org",
    "ipsnews.net",
    "oecdobserver.org",
    "rferl.org",
    # African Regional News (non-Ghana)
    "dailynation.africa",
    "enca.com",
    "ewn.co.za",
    "monitor.co.ug",
    "thecitizen.co.tz",
    "businessinsider.com",
    "africanews.com",
    # Academic & Policy Think Tanks
    "brookings.edu",
    "carnegieendowment.org",
    "cfr.org",
    "foreignpolicy.com",
    "theconversation.com",
}

# Suspicious domains that often spread misinformation
SUSPICIOUS_DOMAINS = {
    "beforeitsnews.com",
    "naturalnews.com",
    "infowars.com",
    "breitbart.com",
    "dailystormer.com",
    "zerohedge.com",
    "activistpost.com",
    "realfarmacy.com",
    "healthnutnews.com",
}


def extract_domain(url):
    """Extract domain from URL"""
    ext = tldextract.extract(url)
    return f"{ext.domain}.{ext.suffix}"


_PATTERNS = [
    (re.compile(r"\b[A-Z]+\s*\(Reuters\)\s*[-–—]?\s*", re.IGNORECASE), ""),
    (re.compile(r"\(Reuters\)", re.IGNORECASE), ""),
    (re.compile(r"Reuters", re.IGNORECASE), ""),
    (
        re.compile(
            r"\b(?:WASHINGTON|NEW YORK|LONDON|PARIS|BERLIN|TOKYO|MOSCOW|BEIJING|DELHI)\s*[-–—]?\s*",
            re.IGNORECASE,
        ),
        "",
    ),
    (re.compile(r"\b(?:AP|CNN|BBC|Fox News|NBC|CBS|ABC News)\b", re.IGNORECASE), ""),
    (re.compile(r"\bBy\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", re.IGNORECASE), ""),
    (re.compile(r"\S+@\S+\.\S+"), ""),
    (re.compile(r"http[s]?://\S+"), ""),
    (re.compile(r"[^a-zA-Z\s]"), " "),
    (re.compile(r"\s+"), " "),
]


def remove_source_artifacts_fast(text):
    """Optimized version of source artifact removal"""
    if not isinstance(text, str) or len(text) < 10:
        return ""

    for pattern, replacement in _PATTERNS:
        text = pattern.sub(replacement, text)

    return text.strip().lower()


def _process_text_chunk(text_chunk):
    """Internal helper to process a chunk of texts in parallel"""
    return [remove_source_artifacts_fast(text) for text in text_chunk]


def parallel_preprocess(texts, n_jobs=None):
    """Parallel preprocessing of texts using multiprocessing"""
    if n_jobs is None:
        n_jobs = min(cpu_count(), 8)

    chunk_size = max(1, len(texts) // n_jobs)
    chunks = [texts[i : i + chunk_size] for i in range(0, len(texts), chunk_size)]

    print(
        f"Processing {len(texts)} texts in {len(chunks)} chunks using {n_jobs} processes..."
    )

    with Pool(n_jobs) as pool:
        results = list(
            tqdm(
                pool.imap(_process_text_chunk, chunks),
                total=len(chunks),
                desc="Preprocessing chunks",
            )
        )

    processed_texts = []
    for chunk_result in results:
        processed_texts.extend(chunk_result)

    return processed_texts
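A short sketch of the two helpers most likely to be called directly; the sample URL and text are invented:

from deploy.utils.general_utils import TRUSTED_DOMAINS, extract_domain, remove_source_artifacts_fast

url = "https://www.myjoyonline.com/politics/some-story"
domain = extract_domain(url)
print(domain, domain in TRUSTED_DOMAINS)  # myjoyonline.com True

raw = "ACCRA (Reuters) - By Jane Doe: The budget statement was read on Tuesday."
print(remove_source_artifacts_fast(raw))  # lower-cased text with agency and byline artifacts stripped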
deploy/utils/url_filter.py
ADDED
@@ -0,0 +1,116 @@
import re


def _is_pdf_or_download_url(url: str) -> bool:
    """Check if URL points to a PDF or download file."""
    url_lower = url.lower()

    # Check for PDF in URL path
    if url_lower.endswith(".pdf"):
        return True

    # Check for PDF in URL path with query parameters
    if ".pdf?" in url_lower or ".pdf#" in url_lower:
        return True

    # Check for other document/download formats
    download_extensions = [
        ".doc",
        ".docx",
        ".xls",
        ".xlsx",
        ".ppt",
        ".pptx",
        ".zip",
        ".rar",
        ".tar",
        ".gz",
        ".7z",
        ".mp3",
        ".mp4",
        ".avi",
        ".mov",
        ".wmv",
        ".exe",
        ".msi",
        ".dmg",
        ".pkg",
        ".epub",
        ".mobi",
        ".djvu",
    ]

    for ext in download_extensions:
        if url_lower.endswith(ext) or f"{ext}?" in url_lower or f"{ext}#" in url_lower:
            return True

    # Check for common download URL patterns
    download_patterns = [
        r"/download/",
        r"/downloads/",
        r"/attachments/",
        r"/files/",
        r"/uploads/",
        r"/wp-content/uploads/",
        r"/content/uploads/",
        r"/assets/downloads/",
        r"/documents/",
        r"/pdfs/",
        r"\.pdf$",
        r"\.pdf\?",
        r"\.pdf#",
        r"attachment\.aspx",
        r"download\.aspx",
        r"getfile\.aspx",
        r"viewdocument\.aspx",
    ]

    return any(re.search(pattern, url_lower) for pattern in download_patterns)


def _is_corrupted_pdf_content(content: str) -> bool:
    """Detect if content appears to be corrupted PDF text."""
    if not content or len(content.strip()) < 10:
        return False

    # Common PDF corruption indicators
    pdf_corruption_patterns = [
        r"endstream\s+endobj",
        r"obj\s*<[^>]*>\s*stream",
        r"%PDF-\d+\.\d+",
        r"xref\s+\d+",
        r"trailer\s*<<",
        r"startxref",
        r"%%EOF",
        r"stream\s+H\s+[^\w\s]{10,}",  # Stream followed by garbled text
        r"[^\w\s]{20,}",  # Long sequences of non-word/space characters
        r"obj\s+<\s*>\s*stream",
        r"BT\s+/F\d+",  # PDF text object indicators
        r"ET\s+Q\s+q",  # PDF graphics state operators
    ]

    corruption_score = 0
    for pattern in pdf_corruption_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            corruption_score += 1

    # Check character distribution - PDFs often have weird character distributions
    if len(content) > 50:
        # Count non-printable or unusual characters
        unusual_chars = sum(
            1 for c in content if ord(c) > 127 or (ord(c) < 32 and c not in "\t\n\r ")
        )
        unusual_ratio = unusual_chars / len(content)

        if unusual_ratio > 0.3:  # More than 30% unusual characters
            corruption_score += 2

    # Check for excessive special characters in a row
    if re.search(r"[^\w\s]{15,}", content):
        corruption_score += 1

    # Check for PDF-specific garbled patterns
    if re.search(r"[A-Za-z0-9]{2,}\s+[^\w\s]{5,}\s+[A-Za-z0-9]{2,}", content):
        corruption_score += 1

    return corruption_score >= 2
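Both helpers are pure functions and can be sanity-checked in isolation; the URLs and snippet below are made up:

from deploy.utils.url_filter import _is_pdf_or_download_url, _is_corrupted_pdf_content

print(_is_pdf_or_download_url("https://example.org/report.pdf"))        # True
print(_is_pdf_or_download_url("https://example.org/news/article-123"))  # False

garbled = "%PDF-1.7 obj <> stream endstream endobj xref 0 12 trailer << startxref %%EOF"
print(_is_corrupted_pdf_content(garbled))  # True: several corruption indicators match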
models/clickbait/feature_info.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:053e642a1e6692cd3ca116bb36aca8aab7c65f45ac07ccd75babd638debf07e3
size 1126
models/clickbait/logistic_regression_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6c03ee9383f2dd1fe51d335adbc292975fd051202d981cdff0805a16941b6f80
size 40845
models/clickbait/tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e1e7a97703602d9eafb046d14e9dd776fce4dbc050cf7ca201fedbc4dd31b13c
size 186468
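The three pointers above are Git LFS stubs for the trained clickbait artifacts. A hedged loading sketch, assuming the scikit-learn version pinned in requirements.txt; the authoritative prediction logic lives in deploy/main/predict_clickbait.py, which is not reproduced here:

import pickle

paths = {
    "vectorizer": "models/clickbait/tfidf_vectorizer.pkl",
    "model": "models/clickbait/logistic_regression_model.pkl",
    "feature_info": "models/clickbait/feature_info.pkl",
}

artifacts = {}
for name, path in paths.items():
    with open(path, "rb") as f:
        artifacts[name] = pickle.load(f)

# Inspect what was serialized; how the TF-IDF features and the handcrafted
# features are combined at prediction time is defined in predict_clickbait.py.
print({name: type(obj).__name__ for name, obj in artifacts.items()})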
requirements.txt
CHANGED
@@ -1,3 +1,13 @@
+beautifulsoup4==4.13.4
+googlesearch-python==1.3.0
+numpy==2.0.2
+requests==2.32.3
+tldextract==5.3.0
+tqdm==4.67.1
+newspaper3k
+lxml_html_clean
+nltk==3.9.1
 sentence-transformers==4.1.0
 torch==2.7.1
+scikit-learn==1.6.1
 textblob==0.19.0
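The new pins cover scraping (beautifulsoup4, newspaper3k, lxml_html_clean, requests), search (googlesearch-python), and the clickbait model (numpy, scikit-learn). On Spaces they are installed automatically at build time; a local run would typically start with:

pip install -r requirements.txt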
semantic_similarity.py
ADDED
@@ -0,0 +1,92 @@
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
from typing import List
import torch
from sentence_transformers import SentenceTransformer, util
from textblob import TextBlob

model = SentenceTransformer("paraphrase-MiniLM-L12-v2")
model.eval()


def calculate_semantic_similarity(
    claim: str, sentences: List[str], similarity_threshold: float = 0.4
) -> float:
    """
    Calculates a weighted score representing how well a list of sentences supports a claim.

    Args:
        claim (str): The claim to be verified.
        sentences (List[str]): A list of sentences to check against the claim.
        similarity_threshold (float, optional): The minimum similarity score for a
            sentence to be considered "supporting". Defaults to 0.4.

    Returns:
        float: A weighted score between 0.0 and 1.0.
    """
    if not sentences:
        return 0.0

    all_scores = []

    with torch.no_grad():
        claim_embedding = model.encode(claim, show_progress_bar=False)
        sentence_embeddings = model.encode(sentences, show_progress_bar=False)
        cosine_scores = util.cos_sim(claim_embedding, sentence_embeddings)[0]
        claim_sentiment = TextBlob(claim).sentiment.polarity

        for i, sentence in enumerate(sentences):
            similarity = cosine_scores[i].item()
            sentence_sentiment = TextBlob(sentence).sentiment.polarity

            # Boost the score slightly when claim and sentence agree in
            # sentiment polarity, and dampen it when they disagree.
            if claim_sentiment * sentence_sentiment > 0:
                similarity *= 1.1
            elif claim_sentiment * sentence_sentiment < 0:
                similarity *= 0.9

            # print(f"Sentence: {sentence}\nSimilarity: {similarity:.2f}\n")
            similarity = max(0.0, min(1.0, similarity))
            all_scores.append(similarity)

    supporting_scores = [s for s in all_scores if s >= similarity_threshold]
    proportion_supporting = len(supporting_scores) / len(sentences)

    if proportion_supporting >= 0.30:
        final_score = sum(supporting_scores) / len(supporting_scores)
    else:
        average_all_scores = sum(all_scores) / len(all_scores)
        # penalty = 0.80  # 20% reduction (currently disabled)
        final_score = average_all_scores  # * penalty

    return final_score


if __name__ == "__main__":
    while True:
        claim_to_verify = input("Enter claim to verify: ")
        evidence = input("Enter evidence sentences: ")
        # Sample evidence kept for reference; the score below is computed only
        # from the single sentence typed in above.
        evidence_sentences = [
            "The recent legislation is projected to stimulate significant economic growth.",  # High similarity
            "Market analysts are optimistic about the financial future following the announcement.",  # High similarity
            "However, some critics argue that the policy might lead to unforeseen inflation.",  # Low similarity
            "The stock market reacted positively, showing a slight increase.",  # Medium similarity
            "This is considered a poor decision for the nation's financial stability by some experts.",  # Opposing sentiment
            "The primary goal of the initiative is to create jobs and encourage consumer spending.",  # High similarity
            "Unemployment rates are expected to decline in the coming months.",  # High similarity
            "There has been some public disapproval regarding the policy's rollout.",  # Low similarity
            "This will surely lead to a stronger and more resilient economy.",  # High similarity
            "Financial experts have voiced concerns about the potential long-term consequences.",  # Opposing sentiment
        ]

        final_score = calculate_semantic_similarity(claim_to_verify, [evidence.strip()])

        print(f"The final weighted support score for the claim is: {final_score:.4f}")

        if final_score > 0.65:
            print("Interpretation: The claim is strongly supported by the evidence.")
        elif final_score > 0.4:
            print("Interpretation: The claim has moderate support from the evidence.")
        else:
            print("Interpretation: The claim has weak support from the evidence.")
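A small library-style call against the function above; the claim and evidence sentences are invented for illustration:

from semantic_similarity import calculate_semantic_similarity

claim = "The new policy will boost economic growth."
evidence = [
    "The recent legislation is projected to stimulate significant economic growth.",
    "Market analysts are optimistic following the announcement.",
    "Some critics argue the policy might lead to unforeseen inflation.",
]

score = calculate_semantic_similarity(claim, evidence, similarity_threshold=0.4)
print(f"Weighted support score: {score:.4f}")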