Spaces: Running
Commit · f078461
Parent(s): b372940
Fake news detector with Gradio interface
Browse files
- .gitattributes +5 -0
- app.py +8 -55
- deploy/__init__.py +0 -0
- deploy/index.py +361 -0
- deploy/main/__init__.py +0 -0
- deploy/main/claim_verifier.py +371 -0
- deploy/main/network_analyzer.py +259 -0
- deploy/main/predict_clickbait.py +43 -0
- deploy/main/source_credibility_analyzer.py +192 -0
- deploy/utils/__init__.py +0 -0
- deploy/utils/clickbait_utils.py +145 -0
- deploy/utils/content_extractor.py +140 -0
- deploy/utils/general_utils.py +197 -0
- deploy/utils/url_filter.py +116 -0
- models/clickbait/feature_info.pkl +3 -0
- models/clickbait/logistic_regression_model.pkl +3 -0
- models/clickbait/tfidf_vectorizer.pkl +3 -0
- requirements.txt +10 -0
- semantic_similarity.py +92 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.vscode
+.venv
+venv
+**/__pycache__/
+snli_1.0_dev.jsonl
app.py
CHANGED
@@ -1,67 +1,20 @@
 import os
+
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 
-from typing import List
-import torch
-from sentence_transformers import SentenceTransformer, util
-from textblob import TextBlob
 import gradio as gr
+from deploy.index import FakeNewsDetector
 
-
-model.eval()
-
-
-def calculate_semantic_similarity(
-    claim: str, sentences_input: str, similarity_threshold: float = 0.4
-) -> float:
-    """
-    Accepts a claim and newline-separated sentences. Returns a weighted similarity score.
-    """
-    sentences = [s.strip() for s in sentences_input.split("\n") if s.strip()]
-    if not sentences:
-        return 0.0
-
-    all_scores = []
-
-    with torch.no_grad():
-        claim_embedding = model.encode(claim, show_progress_bar=False)
-        sentence_embeddings = model.encode(sentences, show_progress_bar=False)
-        cosine_scores = util.cos_sim(claim_embedding, sentence_embeddings)[0]
-        claim_sentiment = TextBlob(claim).sentiment.polarity
-
-        for i, sentence in enumerate(sentences):
-            similarity = cosine_scores[i].item()
-            sentence_sentiment = TextBlob(sentence).sentiment.polarity
-
-            if claim_sentiment * sentence_sentiment > 0:
-                similarity *= 1.1
-            elif claim_sentiment * sentence_sentiment < 0:
-                similarity *= 0.9
-
-            similarity = max(0.0, min(1.0, similarity))
-            all_scores.append(similarity)
-
-    supporting_scores = [s for s in all_scores if s >= similarity_threshold]
-    proportion_supporting = len(supporting_scores) / len(sentences)
-
-    if proportion_supporting >= 0.30:
-        final_score = sum(supporting_scores) / len(supporting_scores)
-    else:
-        final_score = sum(all_scores) / len(all_scores)
-
-    return round(final_score, 4)
-
+detector = FakeNewsDetector()
 
 iface = gr.Interface(
-    fn=
+    fn=detector.comprehensive_verify,
     inputs=[
-        gr.Textbox(label="
-        gr.Textbox(lines=10, label="Evidence Sentences (one per line)"),
-        gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.4, label="Similarity Threshold")
+        gr.Textbox(label="Headline"),
     ],
-    outputs=gr.
-    title="
-    description="Input a
+    outputs=gr.JSON(label="Analysis Result"),  # JSON output for structured verdict
+    title="Fake News Detector",
+    description="Input a headline to check how credible it is.",
 )
 
 iface.launch()
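With this commit the Space itself is only a thin Gradio wrapper: the old in-app similarity scoring is gone and all analysis lives in deploy.index.FakeNewsDetector, whose comprehensive_verify method returns the JSON verdict rendered by gr.JSON. A minimal sketch of driving the detector without the UI, assuming the repo's deploy/ package and models/clickbait/ pickles are importable locally (the headline is just an example):

# Minimal sketch: call the detector directly instead of through the Gradio UI.
# Assumes this repo's deploy/ package and models/clickbait/ artifacts are on the path.
from deploy.index import FakeNewsDetector

detector = FakeNewsDetector()

# comprehensive_verify returns the nested dict that gr.JSON renders in the Space.
report = detector.comprehensive_verify("Example headline to fact-check", results_to_check=8)

print(report["final_verdict"]["verdict"])
print(report["final_verdict"]["score"])
print(report["components"]["source_credibility"]["trusted_count"])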
deploy/__init__.py
ADDED
File without changes
deploy/index.py
ADDED
@@ -0,0 +1,361 @@
import os


os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import gc
import time
import random
from datetime import datetime
from typing import Dict, List, Tuple, Any
import numpy as np
from googlesearch import search

from deploy.main.claim_verifier import ClaimVerifier
from deploy.main.network_analyzer import NetworkAnalyzer
from deploy.main.source_credibility_analyzer import SourceCredibilityAnalyzer
from deploy.utils.general_utils import extract_domain
from deploy.main.predict_clickbait import ClickbaitPredictor

import nltk

try:
    nltk.data.find("tokenizers/punkt")
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt")
    nltk.download("punkt_tab")


class FakeNewsDetector:
    """Main enhanced fact checker with ML integration"""

    def __init__(self):
        try:
            self.source_analyzer = SourceCredibilityAnalyzer()
            self.claim_verifier = ClaimVerifier()
            self.network_analyzer = NetworkAnalyzer()
            self.clickbait_predictor = ClickbaitPredictor()
        except Exception as e:
            print(f"❌ Error initializing components: {e}")
            raise

    def _to_float(self, value: Any, default: float = 0.0) -> float:
        """Safely convert any numeric value to Python float"""
        try:
            if isinstance(value, (np.integer, np.floating)):
                return float(value)
            elif isinstance(value, (int, float)):
                return float(value)
            else:
                return default
        except (ValueError, TypeError):
            return default

    def _analyze_clickbait(self, headline: str) -> float:
        """Analyzes the headline for clickbait characteristics."""
        print("🧠 ML Clickbait Analysis...")
        try:
            _, clickbait_score, _ = self.clickbait_predictor.predict(headline)
            clickbait_score = self._to_float(clickbait_score, 0.5)
            print(f"   Clickbait Score: {clickbait_score:.2f}")
            return clickbait_score
        except Exception as e:
            print(f"   ❌ Clickbait analysis error: {e}")
            return 0.5  # Default moderate score

    def _search_for_sources(self, headline: str, num_results: int) -> List[str]:
        """Searches the web for sources related to the headline."""
        # print("Searching and analyzing sources...")
        try:
            time.sleep(random.uniform(1.5, 3.0))
            search_results = list(search(headline, num_results=num_results, lang="en"))
            # print(f"   Found {len(search_results)} search results")
            return search_results
        except Exception as e:
            print(f"   ❌ Search error: {e}")
            return []

    def _analyze_source_credibility(
        self, search_results: List[str]
    ) -> Tuple[float, int, int]:
        """Analyzes the credibility of the found source domains."""
        print("Analyzing source credibility...")

        if not search_results:
            print("   ❌ No search results to analyze")
            return 0.1, 0, 0

        source_scores = []
        trusted_count = 0
        suspicious_count = 0

        for i, url in enumerate(search_results):
            try:
                domain = extract_domain(url)
                credibility_score = self.source_analyzer.analyze_domain_credibility(
                    domain
                )
                credibility_score = self._to_float(credibility_score, 0.5)
                source_scores.append(credibility_score)

                if credibility_score > 0.7:
                    trusted_count += 1
                    print(f"   {i+1}. {domain} ✅ ({credibility_score:.2f})")
                elif credibility_score < 0.3:
                    suspicious_count += 1
                    print(f"   {i+1}. {domain} ❌ ({credibility_score:.2f})")
                else:
                    print(f"   {i+1}. {domain} ({credibility_score:.2f})")
            except Exception as e:
                print(f"   ❌ Error analyzing {url}: {e}")
                source_scores.append(0.3)  # Default neutral score

        # Use regular Python mean instead of np.mean
        avg_credibility = (
            sum(source_scores) / len(source_scores) if source_scores else 0.1
        )
        return avg_credibility, trusted_count, suspicious_count

    def _analyze_network_propagation(
        self, search_results: List[str]
    ) -> Dict[str, float]:
        """Analyzes the propagation pattern of the news across the network."""
        print("Network Propagation Analysis...")

        if not search_results:
            print("   ❌ No search results for network analysis")
            return {"score": 0.1, "domain_diversity": 0.0}

        try:
            network_analysis = self.network_analyzer.analyze_propagation_pattern(
                search_results
            )

            # Convert all values to Python floats
            result = {
                "score": self._to_float(network_analysis.get("score", 0.1)),
                "domain_diversity": self._to_float(
                    network_analysis.get("domain_diversity", 0.0)
                ),
            }

            print(f"   Propagation Score: {result['score']:.2f}")
            print(f"   Domain Diversity: {result['domain_diversity']:.2f}")
            return result
        except Exception as e:
            print(f"   ❌ Network analysis error: {e}")
            return {"score": 0.1, "domain_diversity": 0.0}

    def _verify_claim(self, headline: str, search_results: List[str]) -> float:
        """Verifies the claim against the content of the found sources."""
        print("✅ Verifying Claims...")

        if not search_results:
            print("   ❌ No search results for claim verification")
            return 0.4

        try:
            verification = self.claim_verifier.verify_claim_against_sources(
                headline, search_results
            )
            claim_verification_score = self._to_float(verification.get("score", 0.4))
            print(f"   '{headline}': {claim_verification_score:.2f}")
            return claim_verification_score
        except Exception as e:
            print(f"   ❌ Claim verification error: {e}")
            return 0.4

    def _calculate_final_score_and_verdict(
        self, component_scores: Dict[str, float]
    ) -> Tuple[float, str, str]:
        """Calculates the final weighted score and determines the verdict."""
        weights = {
            "source_credibility": 0.35,
            "claim_verification": 0.35,
            "network_propagation": 0.20,
            "clickbait_detection": 0.10,
        }

        final_score = sum(
            component_scores.get(component, 0.0) * weight
            for component, weight in weights.items()
        )

        if final_score >= 0.75:
            verdict = "Credible – Backed by Evidence"
            confidence = "Very High"
        elif final_score >= 0.60:
            verdict = "Likely True – Supported by Sources"
            confidence = "High"
        elif final_score >= 0.45:
            verdict = "Unclear – Conflicting Information"
            confidence = "Moderate"
        elif final_score >= 0.30:
            verdict = "Doubtful – Weak or Biased Evidence"
            confidence = "Low"
        else:
            verdict = "False or Misleading – No Basis Found"
            confidence = "Very Low"

        return final_score, verdict, confidence

    def _print_summary(self, results: Dict):
        """Prints a formatted summary of the analysis results."""
        final_verdict = results["final_verdict"]
        components = results["components"]

        print("COMPREHENSIVE ANALYSIS RESULTS:")
        print("─" * 78)
        print(f"Final Score: {final_verdict['score']:.2f}/1.000")
        print(f"Verdict: {final_verdict['verdict']}")
        print(f"Confidence: {final_verdict['confidence']}")

        print("Component Breakdown:")
        for component, score in final_verdict["components"].items():
            print(f"   • {component.replace('_', ' ').title()}: {score:.2f}")

        print("Summary:")
        print(
            f"   • Trusted Sources: {components['source_credibility']['trusted_count']}"
        )
        print(
            f"   • Suspicious Sources: {components['source_credibility']['suspicious_count']}"
        )
        print(
            f"   • Clickbait Score: {components['clickbait']['score']:.2f} (lower is better)"
        )
        print(f"   • Domain Diversity: {components['network']['domain_diversity']:.2f}")

    def comprehensive_verify(
        self, raw_headline: str, results_to_check: int = 8
    ) -> Dict:
        """
        Comprehensive fact-checking with ML integration.
        This method orchestrates the analysis by calling various specialized components.
        """
        print(f'\nComprehensive Analysis: "{raw_headline}"')
        print("=" * 80)

        if not raw_headline or not raw_headline.strip():
            print("❌ Empty or invalid headline provided")
            return {
                "headline": "",
                "timestamp": datetime.now().isoformat(),
                "final_verdict": {
                    "verdict": "❌ Invalid Input",
                    "confidence": "Very High",
                    "score": 0.0,
                    "components": {
                        "claim_verification": 0.0,
                        "source_credibility": 0.0,
                        "clickbait_detection": 0.0,
                        "network_propagation": 0.0,
                    },
                },
                "components": {
                    "clickbait": {"score": 0.0},
                    "source_credibility": {
                        "score": 0.0,
                        "trusted_count": 0,
                        "suspicious_count": 0,
                    },
                    "network": {"score": 0.0, "domain_diversity": 0.0},
                    "claim_verification": {"score": 0.0},
                },
            }

        # Step 1: Search for sources
        search_results = self._search_for_sources(raw_headline, results_to_check)

        if not search_results:
            print("⚠️ No search results found. Assigning low credibility by default.")
            return {
                "headline": raw_headline,
                "timestamp": datetime.now().isoformat(),
                "final_verdict": {
                    "verdict": "🚫 HIGHLY QUESTIONABLE",
                    "confidence": "Very High",
                    "score": 0.1,
                    "components": {
                        "claim_verification": 0.1,
                        "source_credibility": 0.1,
                        "clickbait_detection": 0.1,
                        "network_propagation": 0.1,
                    },
                },
                "components": {
                    "clickbait": {"score": 0.5},
                    "source_credibility": {
                        "score": 0.1,
                        "trusted_count": 0,
                        "suspicious_count": 0,
                    },
                    "network": {"score": 0.1, "domain_diversity": 0.0},
                    "claim_verification": {"score": 0.1},
                },
            }

        # Step 2: Run all analysis components
        clickbait_score = self._analyze_clickbait(raw_headline)
        avg_source_credibility, trusted_count, suspicious_count = (
            self._analyze_source_credibility(search_results)
        )
        network_analysis = self._analyze_network_propagation(search_results)
        claim_verification_score = self._verify_claim(raw_headline, search_results)

        # Step 3: Consolidate component scores (ensure all are Python floats)
        component_scores = {
            "claim_verification": claim_verification_score,
            "source_credibility": avg_source_credibility,
            "clickbait_detection": 1.0 - clickbait_score,  # Invert score
            "network_propagation": network_analysis["score"],
        }

        # Step 4: Calculate final score and verdict
        final_score, verdict, confidence = self._calculate_final_score_and_verdict(
            component_scores
        )

        # Step 5: Build the final JSON result structure
        analysis_results = {
            "headline": raw_headline,
            "timestamp": datetime.now().isoformat(),
            "final_verdict": {
                "verdict": verdict,
                "confidence": confidence,
                "score": round(final_score, 2),
                "components": {
                    "claim_verification": round(
                        component_scores["claim_verification"], 2
                    ),
                    "source_credibility": round(
                        component_scores["source_credibility"], 2
                    ),
                    "clickbait_detection": round(
                        component_scores["clickbait_detection"], 2
                    ),
                    "network_propagation": round(
                        component_scores["network_propagation"], 2
                    ),
                },
            },
            "components": {
                "clickbait": {"score": round(clickbait_score, 2)},
                "source_credibility": {
                    "score": round(avg_source_credibility, 2),
                    "trusted_count": trusted_count,
                    "suspicious_count": suspicious_count,
                },
                "network": {
                    "score": round(network_analysis["score"], 2),
                    "domain_diversity": round(network_analysis["domain_diversity"], 2),
                },
                "claim_verification": {"score": round(claim_verification_score, 2)},
            },
        }

        # self._print_summary(analysis_results)
        gc.collect()
        return analysis_results
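The verdict in _calculate_final_score_and_verdict is a weighted sum of the four component scores (source credibility 0.35, claim verification 0.35, network propagation 0.20, inverted clickbait 0.10) mapped onto five confidence bands. A standalone sketch of that arithmetic with made-up component values:

# Standalone sketch of the Step 3-4 scoring above; the component values are invented.
weights = {
    "source_credibility": 0.35,
    "claim_verification": 0.35,
    "network_propagation": 0.20,
    "clickbait_detection": 0.10,
}

components = {
    "source_credibility": 0.80,         # mostly trusted domains
    "claim_verification": 0.65,         # sources broadly support the claim
    "network_propagation": 0.55,        # decent spread across outlets
    "clickbait_detection": 1.0 - 0.20,  # the raw clickbait score is inverted first
}

final_score = sum(components[name] * weight for name, weight in weights.items())
print(round(final_score, 2))  # 0.7 -> the 0.60-0.75 band: "Likely True", confidence "High"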
deploy/main/__init__.py
ADDED
File without changes
deploy/main/claim_verifier.py
ADDED
@@ -0,0 +1,371 @@
from typing import List, Dict, Optional, Tuple
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib
from urllib.parse import urlparse
import warnings
import re
from nltk.tokenize import sent_tokenize
import string

from deploy.utils.general_utils import TRUSTED_DOMAINS, SUSPICIOUS_DOMAINS
from deploy.utils.content_extractor import extract_content
from deploy.utils.url_filter import _is_corrupted_pdf_content, _is_pdf_or_download_url
from semantic_similarity import calculate_semantic_similarity

warnings.filterwarnings("ignore")

logging.basicConfig(level=logging.INFO, format="%(message)s")


class ClaimVerifier:
    """Enhanced claim verifier with smart sentence extraction and prioritized scraping."""

    def __init__(self, cache_size: int = 500, max_workers: int = 4):
        self.claim_cache: Dict[str, Dict] = {}
        self.content_cache: Dict[str, str] = {}
        self.cache_size = cache_size
        self.max_workers = max_workers
        self.trusted_domains = TRUSTED_DOMAINS
        self.suspicious_domains = SUSPICIOUS_DOMAINS
        self.domain_weights = {"trusted": 2.0, "suspicious": 0.3, "neutral": 1.0}
        self.user_agents = [
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.196 Mobile Safari/537.36",
            "Mozilla/5.0 (iPad; CPU OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/18.18363",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
        ]
        self.current_ua_index = 0
        self.timeout = 10

    def _get_domain_weight(self, url: str) -> Tuple[float, str]:
        domain = urlparse(url).netloc.lower().replace("www.", "")
        if domain in self.trusted_domains:
            return self.domain_weights["trusted"], "trusted"
        elif domain in self.suspicious_domains:
            return self.domain_weights["suspicious"], "suspicious"
        else:
            return self.domain_weights["neutral"], "neutral"

    def _prioritize_sources(self, search_results: List[str]) -> List[str]:
        """Prioritize trusted sources and filter out PDFs/downloads."""
        # First, filter out PDFs and download links
        filtered_results = []
        pdf_count = 0

        for url in search_results:
            if _is_pdf_or_download_url(url):
                pdf_count += 1
                logging.info(f"Filtered out PDF/download URL: {url}")
                continue
            filtered_results.append(url)

        if pdf_count > 0:
            logging.info(f"🚫 Filtered out {pdf_count} PDF/download URLs")

        if not filtered_results:
            logging.warning("⚠️ No valid URLs remaining after filtering PDFs/downloads")
            return []

        # Then prioritize trusted sources
        trusted_sources = [
            url
            for url in filtered_results
            if self._get_domain_weight(url)[1] == "trusted"
        ]
        other_sources = [
            url
            for url in filtered_results
            if self._get_domain_weight(url)[1] != "trusted"
        ]

        if len(trusted_sources) >= 4:
            return trusted_sources[:8]
        else:
            return (trusted_sources + other_sources)[:8]

    def _is_valid_sentence(self, sentence: str) -> bool:
        """Enhanced sentence validation to filter out garbled/corrupted text."""
        sentence = sentence.strip()

        # Basic length check
        if len(sentence) < 20 or len(sentence) > 300:
            return False

        # Check for too many non-ASCII characters (garbled text indicator)
        non_ascii_count = sum(1 for c in sentence if ord(c) > 127)
        if non_ascii_count > len(sentence) * 0.3:  # More than 30% non-ASCII
            return False

        # Check for excessive special characters or symbols
        special_chars = sum(
            1 for c in sentence if c in string.punctuation and c not in ".,!?;:"
        )
        if special_chars > len(sentence) * 0.2:  # More than 20% special chars
            return False

        # Enhanced check for random character patterns (PDF corruption indicators)
        if re.search(r"[^\w\s]{3,}", sentence):  # 3+ consecutive non-word chars
            return False

        # Check for PDF-specific corruption patterns
        if re.search(r"(endstream|endobj|obj\s*<|stream\s+H)", sentence, re.IGNORECASE):
            return False

        # Check for excessive whitespace or control characters
        if re.search(r"\s{3,}", sentence) or any(
            ord(c) < 32 and c not in "\t\n\r" for c in sentence
        ):
            return False

        # Check for minimum word count and average word length
        words = sentence.split()
        if len(words) < 4:
            return False

        # Check for reasonable word lengths (avoid strings like "a b c d e f g")
        avg_word_length = sum(len(word) for word in words) / len(words)
        if avg_word_length < 2.5:
            return False

        # Check for excessive capitalization
        if sum(1 for c in sentence if c.isupper()) > len(sentence) * 0.5:
            return False

        # Check for sequences that look like corrupted encoding
        if re.search(r"[^\w\s]{5,}", sentence):
            return False

        return True

    def _is_noise_sentence(self, sentence: str) -> bool:
        """Check if a sentence is likely noise (navigation, ads, etc.)."""
        noise_patterns = [
            r"^(click|tap|read|view|see|watch|follow|subscribe)",
            r"(cookie|privacy|terms|conditions|policy)",
            r"(advertisement|sponsored|ad)",
            r"(©|copyright|\u00a9)",
            r"^(home|about|contact|menu|search)",
            r"(javascript|enable|browser|update)",
            r"^[\W\d\s]*$",
            r"(share|like|comment|subscribe)",
            r"(login|sign\s+in|register)",
            r"(loading|please\s+wait)",
            # Add PDF-specific noise patterns
            r"(pdf|download|file|document)\s*(viewer|reader)",
            r"(page|pages)\s*\d+\s*(of|\/)\s*\d+",
            r"(adobe|acrobat|reader)",
        ]
        sentence_lower = sentence.lower()
        return any(re.search(pattern, sentence_lower) for pattern in noise_patterns)

    def _extract_relevant_sentences(self, content: str) -> List[str]:
        """Extract relevant, well-formed sentences from the page content."""
        if not content or len(content.strip()) < 50:
            return []

        # Check if content appears to be corrupted PDF
        if _is_corrupted_pdf_content(content):
            logging.warning("🚫 Content appears to be corrupted PDF - skipping")
            return []

        sentences = sent_tokenize(content)

        # Enhanced filtering pipeline
        valid_sentences = []
        for sentence in sentences:
            if self._is_valid_sentence(sentence) and not self._is_noise_sentence(
                sentence
            ):
                valid_sentences.append(sentence.strip())

        if not valid_sentences:
            logging.warning("No valid sentences found after filtering")
            return []

        return valid_sentences

    def _get_user_agent(self) -> str:
        ua = self.user_agents[self.current_ua_index]
        self.current_ua_index = (self.current_ua_index + 1) % len(self.user_agents)
        return ua

    def _cache_key(self, text: str) -> str:
        return hashlib.md5(text.encode()).hexdigest()

    def _add_to_cache(self, key: str, result: Dict):
        if len(self.claim_cache) >= self.cache_size:
            oldest_key = next(iter(self.claim_cache))
            del self.claim_cache[oldest_key]
        self.claim_cache[key] = result

    def _get_from_cache(self, key: str) -> Optional[Dict]:
        return self.claim_cache.get(key)

    def _semantic_similarity_with_sentences(
        self, claim: str, sentences: List[str]
    ) -> float:
        """Calculate entailment scores and return the best one."""
        try:
            score = calculate_semantic_similarity(claim, sentences)
        except Exception as e:
            logging.error(f"Error analyzing sentence: {e}")
            score = 0.0  # fall back to zero support if scoring fails
        return score

    def verify_claim_against_sources(
        self, claim: str, search_results: List[str]
    ) -> Dict:
        logging.info(f"\nVerifying Claim: '{claim}'...")

        cache_key = self._cache_key(f"verify_{claim}")
        if cached_result := self._get_from_cache(cache_key):
            logging.info("Using cached result")
            return cached_result

        prioritized_sources = self._prioritize_sources(search_results)

        if not prioritized_sources:
            logging.warning("⚠️ No valid sources available after filtering")
            return {
                "score": 0.3,
                "total_sources_processed": 0,
                "support_sum": 0.0,
                "total_weight": 0.0,
                "source_details": [],
                "warning": "No valid sources available after filtering PDFs/downloads",
            }

        support_scores = []
        total_weight = 0.0
        source_details = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_url = {
                executor.submit(self._analyze_url, url, claim): url
                for url in prioritized_sources
            }

            try:
                for future in as_completed(future_to_url, timeout=45):
                    url = future_to_url[future]
                    try:
                        if result := future.result(timeout=15):
                            similarity_score, domain_weight, domain_type, sentences = (
                                result
                            )

                            # Enhanced Logging Format
                            logging.info(f"\nSource: {url} ({domain_type})")
                            # logging.info(
                            #     f"   - Relevant Sentences: {sentences[:3]}"
                            # )  # Log first 2 sentences
                            logging.info(
                                f"   - Entailment Score: {similarity_score:.2f}"
                            )

                            total_weight += domain_weight
                            if similarity_score >= 0.4:
                                support_scores.append(similarity_score * domain_weight)

                            source_details.append(
                                {
                                    "url": url,
                                    "semantic_similarity": similarity_score,
                                    "domain_weight": domain_weight,
                                    "domain_type": domain_type,
                                    "relevant_sentences": sentences[:3],
                                }
                            )
                    except Exception as e:
                        logging.error(f"Error processing {url}: {e}")
            except TimeoutError:
                logging.warning("⏰ Timeout: Some URLs were skipped.")

        support_sum = sum(support_scores)

        if total_weight > 0 and support_scores:  # guard against an empty support list
            final_score = min(1.0, support_sum / len(support_scores))
            # Adjustments
            # if final_score < 0.5 and support_sum < 0.5:
            #     final_score *= 0.8
            # elif final_score > 0.5 and support_sum >= 1.0:
            #     final_score = min(0.9, final_score * 1.1)
        else:
            final_score = 0.1

        final_score = max(0.0, min(1.0, final_score))
        logging.info(
            f"\n{'='*20}\nFinal Verification Score: {final_score:.2f}\n{'='*20}"
        )

        result = {
            "score": final_score,
            "total_sources_processed": len(source_details),
            "support_sum": support_sum,
            "total_weight": total_weight,
            "source_details": source_details,
        }
        self._add_to_cache(cache_key, result)
        return result

    def _analyze_url(
        self, url: str, claim: str
    ) -> Optional[Tuple[float, float, str, List[str]]]:
        try:
            # Double-check for PDFs at analysis time (in case some slipped through)
            if _is_pdf_or_download_url(url):
                logging.info(f"🚫 Skipping PDF/download URL at analysis time: {url}")
                return None

            cache_key = self._cache_key(url)
            content = extract_content(
                url,
                self.content_cache,
                cache_key,
                self._get_user_agent,
                self.timeout,
                self.cache_size,
            )

            if not content or len(content.strip()) < 50:
                return None

            # Check for corrupted PDF content
            if _is_corrupted_pdf_content(content):
                logging.warning(f"🚫 Skipping corrupted PDF content from: {url}")
                return None

            # Used for sentence extraction instead of embeddings
            relevant_sentences = self._extract_relevant_sentences(content)

            if not relevant_sentences:
                return None

            # cleaned_content = ""
            # for sentence in relevant_sentences:
            #     if (
            #         sentence.endswith(".")
            #         or sentence.endswith("?")
            #         or sentence.endswith("!")
            #     ):
            #         cleaned_content += f"{sentence} "
            #     else:
            #         cleaned_content += f"{sentence}. "

            semantic_similarity = self._semantic_similarity_with_sentences(
                claim, relevant_sentences
            )

            domain_weight, domain_type = self._get_domain_weight(url)
            # print(f"relevant_sentences: {cleaned_content}")

            return semantic_similarity, domain_weight, domain_type, relevant_sentences
        except Exception as e:
            logging.error(f"Failed to analyze URL {url}: {e}")
            return None
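In verify_claim_against_sources, each source's semantic similarity is multiplied by its domain weight (trusted 2.0, neutral 1.0, suspicious 0.3), only similarities of at least 0.4 count as support, and the final score is the mean of those weighted supports capped at 1.0. A small illustration of that weighting with invented similarity values:

# Illustration only: how ClaimVerifier.domain_weights shapes the verification score.
# The domains and similarity values below are invented for the example.
domain_weights = {"trusted": 2.0, "suspicious": 0.3, "neutral": 1.0}

observations = [
    ("reuters.com", "trusted", 0.62),
    ("example-blog.net", "neutral", 0.45),
    ("randomsite.xyz", "neutral", 0.25),  # below the 0.4 support threshold, ignored
]

support_scores = [
    sim * domain_weights[kind] for _, kind, sim in observations if sim >= 0.4
]
final_score = min(1.0, sum(support_scores) / len(support_scores)) if support_scores else 0.1
print(final_score)  # ~0.85: the trusted source counts double, lifting the average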
deploy/main/network_analyzer.py
ADDED
@@ -0,0 +1,259 @@
import math
from collections import Counter
from typing import List, Dict

from deploy.utils.general_utils import (
    TRUSTED_DOMAINS,
    SUSPICIOUS_DOMAINS,
    extract_domain,
)

SOCIAL_AGGREGATOR_DOMAINS = {
    "facebook.com",
    "twitter.com",
    "reddit.com",
    "youtube.com",
    "instagram.com",
    "tiktok.com",
    "google.com",
    "yahoo.com",
    "msn.com",
    "aol.com",
    "linkedin.com",
    "pinterest.com",
    "snapchat.com",
    "discord.com",
    "telegram.org",
}

CONTENT_FARM_DOMAINS = {
    "buzzfeed.com",
    "clickhole.com",
    "upworthy.com",
    "viralthread.com",
    "shareably.net",
    "littlethings.com",
    "providr.com",
    "shared.com",
}


class NetworkAnalyzer:
    """Propagation pattern analyzer - returns only score, and domain_diversity"""

    def __init__(self):
        # Scoring weights
        self.weights = {"domain_credibility": 0.60, "diversity_quality": 0.40}
        self.min_sources_threshold = 3
        self.min_unique_domains = 2

    def _calculate_domain_credibility_score(self, domains: List[str]) -> float:
        """Calculate domain credibility score"""
        if not domains:
            return 0.0

        domain_counts = Counter(domains)
        total_sources = len(domains)

        # Categorize domains
        trusted_count = sum(
            count
            for domain, count in domain_counts.items()
            if domain in TRUSTED_DOMAINS
        )
        suspicious_count = sum(
            count
            for domain, count in domain_counts.items()
            if domain in SUSPICIOUS_DOMAINS
        )
        social_count = sum(
            count
            for domain, count in domain_counts.items()
            if domain in SOCIAL_AGGREGATOR_DOMAINS
        )
        content_farm_count = sum(
            count
            for domain, count in domain_counts.items()
            if domain in CONTENT_FARM_DOMAINS
        )

        # Calculate ratios
        trusted_ratio = trusted_count / total_sources
        suspicious_ratio = suspicious_count / total_sources
        social_ratio = social_count / total_sources
        content_farm_ratio = content_farm_count / total_sources
        unknown_ratio = 1 - (
            trusted_ratio + suspicious_ratio + social_ratio + content_farm_ratio
        )

        # Calculate score
        base_score = 0.15
        score = base_score
        score += trusted_ratio * 0.6
        score -= suspicious_ratio * 0.8
        score -= content_farm_ratio * 0.4
        score += social_ratio * 0.1
        score -= unknown_ratio * 0.2

        # Additional penalties
        if suspicious_ratio > 0.5:
            score -= 0.3
        if trusted_count == 0 and total_sources > 5:
            score -= 0.2
        if content_farm_ratio > 0.4:
            score -= 0.15

        return max(0.0, min(1.0, score))

    def _calculate_diversity_quality(self, domains: List[str]) -> Dict:
        """Calculate diversity quality - returns score and entropy.

        Entropy here is a statistical measure of domain diversity,
        helping to assess whether a claim's spread is broad and
        organic or narrow and potentially suspicious.
        """
        if len(domains) < 2:
            return {"score": 0.0, "entropy": 0.0}

        domain_counts = Counter(domains)
        unique_domains = len(set(domains))
        total_sources = len(domains)

        # Calculate Shannon entropy
        entropy = 0.0
        for count in domain_counts.values():
            p = count / total_sources
            if p > 0:
                entropy -= p * math.log2(p)

        # Normalize entropy
        max_entropy = math.log2(unique_domains) if unique_domains > 1 else 0
        normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0

        # Base diversity score
        diversity_score = normalized_entropy

        # Detect artificial patterns
        max_domain_share = max(domain_counts.values()) / total_sources

        # Single domain dominance penalty
        if max_domain_share > 0.7 and unique_domains > 3:
            diversity_score -= 0.4

        # Artificial diversity penalty
        single_mention_domains = sum(
            1 for count in domain_counts.values() if count == 1
        )
        if single_mention_domains > total_sources * 0.8 and total_sources > 10:
            diversity_score -= 0.3

        if 0.3 <= normalized_entropy <= 0.8 and unique_domains >= 3:
            diversity_score += 0.2

        return {
            "score": max(0.0, min(1.0, diversity_score)),
            "entropy": normalized_entropy,
        }

    def analyze_propagation_pattern(self, search_results: List[str]) -> Dict:
        """Analyze propagation pattern - returns score, and domain_diversity"""
        domains = []
        valid_urls = 0

        for url in search_results:
            domain = extract_domain(url)
            if domain and domain not in ["", "localhost"]:
                domains.append(domain)
                valid_urls += 1

        # Early return for insufficient data
        if len(domains) < self.min_sources_threshold:
            return {"score": 0.1, "domain_diversity": 0.0}

        # Perform analysis
        credibility_score = self._calculate_domain_credibility_score(domains)
        diversity_analysis = self._calculate_diversity_quality(domains)

        # Calculate weighted final score
        final_score = (
            credibility_score * self.weights["domain_credibility"]
            + diversity_analysis["score"] * self.weights["diversity_quality"]
        )

        # Additional quality adjustments
        unique_domains = len(set(domains))
        trusted_count = sum(1 for d in domains if d in TRUSTED_DOMAINS)
        suspicious_count = sum(1 for d in domains if d in SUSPICIOUS_DOMAINS)

        if trusted_count >= 3 and suspicious_count == 0:
            final_score += 0.1
        elif suspicious_count > trusted_count:
            final_score -= 0.15

        if unique_domains < self.min_unique_domains:
            final_score = min(final_score, 0.3)

        final_score = max(0.0, min(1.0, final_score))

        return {
            "score": round(final_score, 3),
            "domain_diversity": round(diversity_analysis["entropy"], 3),
        }


if __name__ == "__main__":
    analyzer = NetworkAnalyzer()

    # Test Case 1: Mixed credible and suspicious domains
    search_results_1 = [
        "https://reuters.com/news/article1",
        "https://bbc.com/news/article2",
        "https://ghanaweb.com/article3",
        "https://cnn.com/article4",
        "https://naturalnews.com/fake1",
        "https://infowars.com/fake2",
    ]
    print("\nTest Case 1: Mixed credible and suspicious")
    result1 = analyzer.analyze_propagation_pattern(search_results_1)
    print(f"Result: {result1}")

    # Test Case 2: Mostly trusted domains
    search_results_2 = [
        "https://bbc.com/article",
        "https://cnn.com/article",
        "https://reuters.com/article",
        "https://nytimes.com/article",
        "https://ghanaweb.com/article",
    ]
    print("\nTest Case 2: Mostly trusted domains")
    result2 = analyzer.analyze_propagation_pattern(search_results_2)
    print(f"Result: {result2}")

    # Test Case 3: Mostly suspicious and content farms
    search_results_3 = [
        "https://infowars.com/fake",
        "https://naturalnews.com/fake",
        "https://clickhole.com/funny",
        "https://upworthy.com/clickbait",
        "https://shared.com/share",
    ]
    print("\nTest Case 3: Suspicious and content farm heavy")
    result3 = analyzer.analyze_propagation_pattern(search_results_3)
    print(f"Result: {result3}")

    # Test Case 4: Low diversity (same domain repeated)
    search_results_4 = [
        "https://buzzfeed.com/post1",
        "https://buzzfeed.com/post2",
        "https://buzzfeed.com/post3",
        "https://buzzfeed.com/post4",
        "https://buzzfeed.com/post5",
    ]
    print("\nTest Case 4: Low domain diversity")
    result4 = analyzer.analyze_propagation_pattern(search_results_4)
    print(f"Result: {result4}")

    # Test Case 5: Not enough sources
    search_results_5 = ["https://cnn.com/article1"]
    print("\nTest Case 5: Insufficient results")
    result5 = analyzer.analyze_propagation_pattern(search_results_5)
    print(f"Result: {result5}")
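The domain_diversity value returned above is the Shannon entropy of the domain distribution, normalized by log2 of the number of unique domains: values near 1.0 mean the results are spread evenly across outlets, values near 0 mean one outlet dominates. A compact sketch of just that computation, with an invented domain list:

# Sketch of the normalized-entropy calculation from _calculate_diversity_quality;
# the domain list is invented for illustration.
import math
from collections import Counter

domains = ["buzzfeed.com", "buzzfeed.com", "buzzfeed.com", "bbc.com", "reuters.com"]

counts = Counter(domains)
total = len(domains)

# Shannon entropy of the domain distribution
entropy = -sum((c / total) * math.log2(c / total) for c in counts.values())

# Normalize by the maximum possible entropy for this many unique domains
max_entropy = math.log2(len(counts)) if len(counts) > 1 else 0
domain_diversity = entropy / max_entropy if max_entropy > 0 else 0
print(round(domain_diversity, 3))  # ~0.865: skewed toward one outlet, but not single-source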
deploy/main/predict_clickbait.py
ADDED
@@ -0,0 +1,43 @@
import pickle
import numpy as np
from deploy.utils.clickbait_utils import extract_enhanced_features


class ClickbaitPredictor:
    def __init__(self, model_dir="./models/clickbait"):
        try:
            with open(f"{model_dir}/logistic_regression_model.pkl", "rb") as f:
                self.classifier = pickle.load(f)
            with open(f"{model_dir}/tfidf_vectorizer.pkl", "rb") as f:
                self.tfidf_vectorizer = pickle.load(f)
            with open(f"{model_dir}/feature_info.pkl", "rb") as f:
                self.clickbait_indicators = pickle.load(f)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            self.classifier = None
            self.tfidf_vectorizer = None
            self.clickbait_indicators = None

    def predict(self, headline, threshold=0.5):
        if self.classifier is None or self.tfidf_vectorizer is None:
            raise RuntimeError("Model or vectorizer not loaded.")
        tfidf_features = self.tfidf_vectorizer.transform([headline])
        handcrafted_features = extract_enhanced_features([headline])
        combined_features = np.hstack((tfidf_features.toarray(), handcrafted_features))
        lr_probs = self.classifier.predict_proba(combined_features)[0]
        lr_score = lr_probs[1]
        is_clickbait = lr_score >= threshold
        confidence = lr_score if is_clickbait else (1 - lr_score)
        return is_clickbait, lr_score, confidence


if __name__ == "__main__":
    predictor = ClickbaitPredictor()
    while True:
        headline = input("Enter a headline to check clickbait score: ")
        is_clickbait, score, confidence = predictor.predict(headline)
        status = "CLICKBAIT" if is_clickbait else "NORMAL"
        print(f"{status} (Score: {score:.3f}, Confidence: {confidence:.3f})")
        print(f"   '{headline}'")
        print()
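ClickbaitPredictor.predict concatenates the headline's TF-IDF vector with the handcrafted feature vector before calling the logistic regression, so the column order must match what the pickled model was trained on. A toy sketch of the shape handling with dummy arrays (the dimensions are invented):

# Toy sketch of the feature concatenation in predict(); the dimensions are invented
# stand-ins for tfidf_vectorizer.transform(...).toarray() and extract_enhanced_features(...).
import numpy as np

tfidf_part = np.zeros((1, 5000))      # one row per headline, one column per TF-IDF term
handcrafted_part = np.ones((1, 12))   # one row per headline, one column per handcrafted feature

combined = np.hstack((tfidf_part, handcrafted_part))
print(combined.shape)  # (1, 5012) - must match the feature layout used at training time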
deploy/main/source_credibility_analyzer.py
ADDED
@@ -0,0 +1,192 @@
import re
from urllib.parse import urlparse

from deploy.utils.general_utils import TRUSTED_DOMAINS, SUSPICIOUS_DOMAINS


class SourceCredibilityAnalyzer:
    """Simplified source credibility analyzer - returns only the score"""

    def __init__(self):
        # Weighted scoring system
        self.weights = {
            "tld_credibility": 0.4,
            "domain_structure": 0.3,
            "news_indicators": 0.2,
            "domain_age_indicators": 0.15,
            "subdomain_analysis": 0.1,
        }

        # Suspicious patterns
        self._suspicious_patterns = [
            (re.compile(r"\d{4,}"), 0.8),
            (
                re.compile(r"(fake|hoax|scam|click|bait|spam|phishing)", re.IGNORECASE),
                0.9,
            ),
            (re.compile(r"[a-z]+\d+[a-z]+\d+", re.IGNORECASE), 0.7),
            (re.compile(r"(xxx|porn|adult|sex)", re.IGNORECASE), 0.6),
            (re.compile(r"(free|download|crack|hack)", re.IGNORECASE), 0.5),
            (re.compile(r"[0-9]{1,3}-[0-9]{1,3}-[0-9]{1,3}", re.IGNORECASE), 0.8),
            (re.compile(r"(temp|tmp|test|demo)", re.IGNORECASE), 0.4),
        ]

        # TLD scores
        self.tld_scores = {
            ".edu": 0.9,
            ".gov": 0.95,
            ".mil": 0.9,
            ".org": 0.7,
            ".ac.uk": 0.8,
            ".edu.au": 0.8,
            ".com": 0.3,
            ".net": 0.25,
            ".co.uk": 0.4,
            ".com.au": 0.4,
            ".de": 0.4,
            ".fr": 0.4,
            ".ca": 0.4,
            ".jp": 0.4,
            ".info": 0.1,
            ".biz": 0.1,
            ".name": 0.05,
            ".tk": -0.6,
            ".ml": -0.6,
            ".ga": -0.6,
            ".cf": -0.6,
            ".pw": -0.4,
            ".top": -0.3,
            ".click": -0.5,
            ".download": -0.4,
            ".stream": -0.3,
            ".review": -0.2,
            ".date": -0.3,
            ".racing": -0.4,
        }

        # News indicators
        self.news_indicators = {
            "news": 0.3,
            "times": 0.3,
            "post": 0.25,
            "herald": 0.2,
            "gazette": 0.2,
            "journal": 0.2,
            "tribune": 0.2,
            "chronicle": 0.2,
            "report": 0.15,
            "press": 0.2,
            "media": 0.1,
            "broadcast": 0.15,
            "reuters": 0.4,
            "associated": 0.3,
            "wire": 0.2,
        }

    def analyze_domain_credibility(self, domain: str) -> float:
        """Get credibility score for domain"""
        domain = domain.lower().strip()

        # Handle URLs by extracting domain
        if domain.startswith(("http://", "https://")):
            parsed = urlparse(domain)
            domain = parsed.netloc.lower()

        # Remove www prefix
        if domain.startswith("www."):
            domain = domain[4:]

        # Check trusted domains
        if domain in TRUSTED_DOMAINS:
            return 0.95

        # Check suspicious domains
        if domain in SUSPICIOUS_DOMAINS:
            return 0.05

        # Calculate score components
        tld_score = self._get_tld_score(domain)
        structure_score = self._get_structure_score(domain)
        news_score = self._get_news_score(domain)
        establishment_score = self._get_establishment_score(domain)
        subdomain_score = self._get_subdomain_score(domain)

        # Start with base score and apply weighted components
        base_score = 0.2
        final_score = base_score
        final_score += tld_score * self.weights["tld_credibility"]
        final_score += structure_score * self.weights["domain_structure"]
        final_score += news_score * self.weights["news_indicators"]
        final_score += establishment_score * self.weights["domain_age_indicators"]
        final_score += subdomain_score * self.weights["subdomain_analysis"]

        return max(0.0, min(1.0, round(final_score, 2)))

    def _get_tld_score(self, domain: str) -> float:
        """Get TLD score"""
        for tld, score in self.tld_scores.items():
            if domain.endswith(tld):
                return score
        return -0.1  # Unknown TLD

    def _get_structure_score(self, domain: str) -> float:
        """Get domain structure score"""
        suspicious_score = 0

        for pattern, severity in self._suspicious_patterns:
            if pattern.search(domain):
                suspicious_score -= severity * 0.3

        if len(domain.split(".")[0]) < 3:
            suspicious_score -= 0.2

        if domain.count("-") > 2:
            suspicious_score -= 0.15

        return max(-0.8, suspicious_score)

    def _get_news_score(self, domain: str) -> float:
        """Get news indicators score"""
        score = 0
        for indicator, weight in self.news_indicators.items():
            if indicator in domain:
                score += weight
        return min(0.4, score)

    def _get_establishment_score(self, domain: str) -> float:
        """Get establishment indicators score"""
        score = 0

        if any(
            word in domain
            for word in ["university", "college", "institute", "foundation"]
        ):
            score += 0.3

        if any(word in domain for word in ["library", "museum", "archive"]):
            score += 0.2

        if any(word in domain for word in ["research", "study", "science"]):
            score += 0.15

        return min(0.3, score)

    def _get_subdomain_score(self, domain: str) -> float:
        """Get subdomain score"""
        parts = domain.split(".")

        if len(parts) <= 2:
            return 0.1
        elif len(parts) > 4:
            return -0.15
        else:
            return 0


if __name__ == "__main__":
    analyzer = SourceCredibilityAnalyzer()
    # domains_to_analyze = ["ghanaweb.com"]
    domain = input("Enter a domain to check credibility: ")
    # for domain in domains_to_analyze:
    result = analyzer.analyze_domain_credibility(domain)
    print(f"{domain} -> {result:.2f}")
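A minimal usage sketch for the analyzer above, assuming the deploy package is importable from the Space's working directory; the example domains are illustrative only:

from deploy.main.source_credibility_analyzer import SourceCredibilityAnalyzer

analyzer = SourceCredibilityAnalyzer()

# Known trusted domains short-circuit to 0.95 and known suspicious ones to 0.05;
# anything else falls through to the weighted TLD/structure/news scoring.
for domain in ["bbc.com", "breaking-news1234.tk", "https://www.ghanaweb.com/article"]:
    print(domain, analyzer.analyze_domain_credibility(domain))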
deploy/utils/__init__.py
ADDED
File without changes
deploy/utils/clickbait_utils.py
ADDED
@@ -0,0 +1,145 @@
import re
import numpy as np

clickbait_indicators = {
    "curiosity_gap": [
        "you won't believe",
        "wait until you see",
        "what happened next",
        "the reason will shock you",
        "this is why",
        "here's what happened",
        "the truth about",
        "what nobody tells you",
        "finally revealed",
    ],
    "emotional_triggers": [
        "shocking",
        "incredible",
        "amazing",
        "unbelievable",
        "stunning",
        "heartbreaking",
        "hilarious",
        "terrifying",
        "adorable",
        "outrageous",
        "mind-blowing",
        "jaw-dropping",
        "breathtaking",
    ],
    "urgency_scarcity": [
        "breaking",
        "urgent",
        "limited time",
        "before it's too late",
        "act now",
        "don't miss",
        "last chance",
        "expires soon",
    ],
    "personal_relevance": [
        "in your area",
        "people like you",
        "your age",
        "based on your",
        "you need to know",
        "this affects you",
        "for people who",
    ],
    "superlatives": [
        "ultimate",
        "perfect",
        "best ever",
        "greatest",
        "worst",
        "most amazing",
        "incredible",
        "unmatched",
        "revolutionary",
    ],
    "numbers_lists": [
        r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)",
        r"one\s+(weird|simple|amazing)\s+trick",
        r"\d+\s+minute[s]?",
        r"in\s+\d+\s+(steps?|minutes?|days?)",
    ],
    "authority_social_proof": [
        "doctors hate",
        "experts don't want",
        "celebrities use",
        "scientists discovered",
        "research shows",
        "studies prove",
    ],
}


def extract_enhanced_features(texts):
    """Extract comprehensive handcrafted features"""
    features = []

    for text in texts:
        if not isinstance(text, str):
            text = str(text) if text is not None else ""

        text_lower = text.lower()
        feature_vector = []

        # Clickbait pattern scores by category
        for category, patterns in clickbait_indicators.items():
            category_score = 0
            for pattern in patterns:
                if category == "numbers_lists":
                    # These entries are regular expressions, not literal substrings
                    if re.search(pattern, text_lower):
                        category_score += 1
                elif pattern in text_lower:
                    category_score += 1

            # Normalize by pattern count in category
            normalized_score = min(category_score / len(patterns), 1.0)
            feature_vector.append(normalized_score)

        # Punctuation and formatting features
        exclamation_ratio = text.count("!") / max(len(text), 1)
        question_ratio = text.count("?") / max(len(text), 1)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)

        feature_vector.extend(
            [
                min(exclamation_ratio * 10, 1.0),
                min(question_ratio * 10, 1.0),
                min(caps_ratio * 5, 1.0),
            ]
        )

        # Length and structure features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)

        feature_vector.extend(
            [
                min(word_count / 20, 1.0),  # Normalized word count
                min(avg_word_length / 8, 1.0),  # Normalized avg word length
                1.0 if word_count > 10 else 0.0,  # Long headline indicator
            ]
        )

        # Semantic features
        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
        number_count = len(
            [word for word in words if any(char.isdigit() for char in word)]
        )

        feature_vector.extend(
            [
                min(all_caps_words / max(word_count, 1), 1.0),
                min(number_count / max(word_count, 1), 1.0),
            ]
        )

        features.append(feature_vector)

    return np.array(features)
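A quick, hypothetical check of the feature extractor on two invented headlines; each returned row holds the seven per-category scores followed by eight punctuation, length, and casing features (15 values in total):

from deploy.utils.clickbait_utils import extract_enhanced_features

headlines = [
    "You Won't Believe What This Dog Did Next!",
    "Government publishes quarterly inflation report",
]
features = extract_enhanced_features(headlines)
print(features.shape)  # (2, 15)
print(features[0])     # the clickbait-style headline scores higher in several categories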
deploy/utils/content_extractor.py
ADDED
@@ -0,0 +1,140 @@
import re
import time
import requests
from bs4 import BeautifulSoup
from newspaper import Article


def extract_content(
    url: str,
    content_cache: dict,
    cache_key: str,
    get_user_agent,
    timeout: int,
    cache_size: int,
) -> str:
    """Enhanced content extraction with newspaper3k fallback to BeautifulSoup."""
    if cache_key in content_cache:
        return content_cache[cache_key]

    try:
        # Try newspaper3k first
        article = Article(url)
        article.download()
        article.parse()

        content = article.text

        # If newspaper3k didn't get good content, fall back to BeautifulSoup
        if not content or len(content.strip()) < 100:
            content = _fallback_extraction(url, get_user_agent, timeout)

        # Clean and normalize content
        content = _clean_content(content)
        content = content[:10000]  # Increased from 8000

        # Cache result
        if len(content_cache) >= cache_size:
            oldest_key = next(iter(content_cache))
            del content_cache[oldest_key]

        content_cache[cache_key] = content
        return content

    except Exception:
        # If newspaper3k fails, try BeautifulSoup fallback
        try:
            content = _fallback_extraction(url, get_user_agent, timeout)
            content = _clean_content(content)
            content = content[:10000]

            if len(content_cache) >= cache_size:
                oldest_key = next(iter(content_cache))
                del content_cache[oldest_key]

            content_cache[cache_key] = content
            return content
        except Exception:
            return ""


def _fallback_extraction(url: str, get_user_agent, timeout: int) -> str:
    """Fallback extraction using BeautifulSoup."""
    headers = {
        "User-Agent": get_user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    time.sleep(0.5)

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Handle encoding
    if response.encoding is None or response.encoding.lower() in ["iso-8859-1", "ascii"]:
        response.encoding = "utf-8"

    try:
        html_content = response.text
    except UnicodeDecodeError:
        try:
            html_content = response.content.decode("utf-8", errors="ignore")
        except UnicodeDecodeError:
            html_content = response.content.decode("latin-1", errors="replace")

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove irrelevant content
    for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "iframe"]):
        element.decompose()

    # Extract content using selectors
    content_selectors = [
        "article",
        "main",
        '[role="main"]',
        ".content",
        ".article-content",
        ".post-content",
        ".entry-content",
        ".article-body",
    ]

    extracted_text = ""
    for selector in content_selectors:
        elements = soup.select(selector)
        if elements:
            extracted_text = " ".join(
                [elem.get_text(separator=" ", strip=True) for elem in elements]
            )
            break

    if not extracted_text:
        content_elements = soup.find_all(
            ["p", "div"], class_=lambda x: x is None or "ad" not in str(x).lower()
        )
        extracted_text = " ".join(
            [elem.get_text(separator=" ", strip=True) for elem in content_elements]
        )

    if not extracted_text:
        extracted_text = soup.get_text(separator=" ", strip=True)

    return extracted_text


def _clean_content(content: str) -> str:
    """Clean and normalize extracted content."""
    # Clean problematic characters
    content = content.replace("\ufffd", " ")
    content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x84\x86-\x9f]", " ", content)

    # Normalize unicode if available
    try:
        import unicodedata

        content = unicodedata.normalize("NFKD", content)
    except Exception:
        pass

    # Normalize whitespace and clean
    content = re.sub(r"\s+", " ", content).strip()
    content = re.sub(r"[^\x20-\x7E\u00A0-\uFFFF]", " ", content)

    return content
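An illustrative call, assuming the caller owns the cache dict and supplies a user-agent callable (both are injected rather than created by this module); the URL is a placeholder:

from deploy.utils.content_extractor import extract_content

cache = {}  # simple in-memory cache shared across calls
url = "https://example.com/some-article"

text = extract_content(
    url,
    content_cache=cache,
    cache_key=url,  # the URL itself serves as the cache key here
    get_user_agent=lambda: "Mozilla/5.0 (compatible; NewsVerifier/1.0)",
    timeout=10,
    cache_size=100,
)
print(text[:300])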
deploy/utils/general_utils.py
ADDED
@@ -0,0 +1,197 @@
import tldextract
import re
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# Trusted domains
TRUSTED_DOMAINS = {
    # International Mainstream News
    "abcnews.go.com",
    "aljazeera.com",
    "apnews.com",
    "bbc.com",
    "bloomberg.com",
    "cbc.ca",
    "cbsnews.com",
    "cnn.com",
    "dw.com",
    "economist.com",
    "euronews.com",
    "forbes.com",
    "ft.com",
    "indiatimes.com",
    "japantimes.co.jp",
    "latimes.com",
    "npr.org",
    "nytimes.com",
    "reuters.com",
    "smh.com.au",
    "theguardian.com",
    "usatoday.com",
    "washingtonpost.com",
    "wsj.com",
    "france24.com",
    # Ghana-Specific News
    "3news.com",
    "adomonline.com",
    "citinewsroom.com",
    "ghanaweb.com",
    "ghanaiantimes.com.gh",
    "ghananewsagency.org",
    "graphic.com.gh",
    "modernghana.com",
    "myjoyonline.com",
    "peacefmonline.com",
    "pulse.com.gh",
    "starrfm.com.gh",
    "thebftonline.com",
    "yen.com.gh",
    "nsmq.com.gh",
    # Sports News
    "cbssports.com",
    "espn.com",
    "eurosport.com",
    "fifa.com",
    "footballghana.com",
    "foxsports.com",
    "ghanasoccernet.com",
    "goal.com",
    "nba.com",
    "nbcsports.com",
    "onefootball.com",
    "skysports.com",
    "sportinglife.com",
    "supersport.com",
    "tntsports.co.uk",
    "theathletic.com",
    "olympics.com",
    # Entertainment & Pop Culture
    "billboard.com",
    "deadline.com",
    "entertainment.com",
    "eonline.com",
    "ew.com",
    "hollywoodreporter.com",
    "indiewire.com",
    "people.com",
    "rollingstone.com",
    "thewrap.com",
    "variety.com",
    # Science & Research
    "eurekalert.org",
    "medpagetoday.com",
    "nasa.gov",
    "nature.com",
    "sciencealert.com",
    "sciencenews.org",
    "statnews.com",
    # Fact-Checking & Watchdogs
    "africacheck.org",
    "factcheck.org",
    "fullfact.org",
    "politifact.com",
    "snopes.com",
    # Global & General Niche News
    "asia.nikkei.com",
    "globalissues.org",
    "ipsnews.net",
    "oecdobserver.org",
    "rferl.org",
    # African Regional News (non-Ghana)
    "dailynation.africa",
    "enca.com",
    "ewn.co.za",
    "monitor.co.ug",
    "thecitizen.co.tz",
    "businessinsider.com",
    "africanews.com",
    # Academic & Policy Think Tanks
    "brookings.edu",
    "carnegieendowment.org",
    "cfr.org",
    "foreignpolicy.com",
    "theconversation.com",
}

# Suspicious domains that often spread misinformation
SUSPICIOUS_DOMAINS = {
    "beforeitsnews.com",
    "naturalnews.com",
    "infowars.com",
    "breitbart.com",
    "dailystormer.com",
    "zerohedge.com",
    "activistpost.com",
    "realfarmacy.com",
    "healthnutnews.com",
}


def extract_domain(url):
    """Extract domain from URL"""
    ext = tldextract.extract(url)
    return f"{ext.domain}.{ext.suffix}"


_PATTERNS = [
    (re.compile(r"\b[A-Z]+\s*\(Reuters\)\s*[-–—]?\s*", re.IGNORECASE), ""),
    (re.compile(r"\(Reuters\)", re.IGNORECASE), ""),
    (re.compile(r"Reuters", re.IGNORECASE), ""),
    (
        re.compile(
            r"\b(?:WASHINGTON|NEW YORK|LONDON|PARIS|BERLIN|TOKYO|MOSCOW|BEIJING|DELHI)\s*[-–—]?\s*",
            re.IGNORECASE,
        ),
        "",
    ),
    (re.compile(r"\b(?:AP|CNN|BBC|Fox News|NBC|CBS|ABC News)\b", re.IGNORECASE), ""),
    (re.compile(r"\bBy\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", re.IGNORECASE), ""),
    (re.compile(r"\S+@\S+\.\S+"), ""),
    (re.compile(r"http[s]?://\S+"), ""),
    (re.compile(r"[^a-zA-Z\s]"), " "),
    (re.compile(r"\s+"), " "),
]


def remove_source_artifacts_fast(text):
    """Optimized version of source artifact removal"""
    if not isinstance(text, str) or len(text) < 10:
        return ""

    for pattern, replacement in _PATTERNS:
        text = pattern.sub(replacement, text)

    return text.strip().lower()


def _process_text_chunk(text_chunk):
    """Internal helper to process a chunk of texts in parallel"""
    return [remove_source_artifacts_fast(text) for text in text_chunk]


def parallel_preprocess(texts, n_jobs=None):
    """Parallel preprocessing of texts using multiprocessing"""
    if n_jobs is None:
        n_jobs = min(cpu_count(), 8)

    chunk_size = max(1, len(texts) // n_jobs)
    chunks = [texts[i : i + chunk_size] for i in range(0, len(texts), chunk_size)]

    print(
        f"Processing {len(texts)} texts in {len(chunks)} chunks using {n_jobs} processes..."
    )

    with Pool(n_jobs) as pool:
        results = list(
            tqdm(
                pool.imap(_process_text_chunk, chunks),
                total=len(chunks),
                desc="Preprocessing chunks",
            )
        )

    processed_texts = []
    for chunk_result in results:
        processed_texts.extend(chunk_result)

    return processed_texts
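A short sketch of the two helpers most likely to be called directly; the sample URL and text are invented:

from deploy.utils.general_utils import TRUSTED_DOMAINS, extract_domain, remove_source_artifacts_fast

url = "https://www.myjoyonline.com/politics/some-story"
domain = extract_domain(url)
print(domain, domain in TRUSTED_DOMAINS)  # myjoyonline.com True

raw = "ACCRA (Reuters) - By Jane Doe: The budget statement was read on Tuesday."
print(remove_source_artifacts_fast(raw))  # lower-cased text with agency and byline artifacts stripped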
deploy/utils/url_filter.py
ADDED
@@ -0,0 +1,116 @@
import re


def _is_pdf_or_download_url(url: str) -> bool:
    """Check if URL points to a PDF or download file."""
    url_lower = url.lower()

    # Check for PDF in URL path
    if url_lower.endswith(".pdf"):
        return True

    # Check for PDF in URL path with query parameters
    if ".pdf?" in url_lower or ".pdf#" in url_lower:
        return True

    # Check for other document/download formats
    download_extensions = [
        ".doc",
        ".docx",
        ".xls",
        ".xlsx",
        ".ppt",
        ".pptx",
        ".zip",
        ".rar",
        ".tar",
        ".gz",
        ".7z",
        ".mp3",
        ".mp4",
        ".avi",
        ".mov",
        ".wmv",
        ".exe",
        ".msi",
        ".dmg",
        ".pkg",
        ".epub",
        ".mobi",
        ".djvu",
    ]

    for ext in download_extensions:
        if url_lower.endswith(ext) or f"{ext}?" in url_lower or f"{ext}#" in url_lower:
            return True

    # Check for common download URL patterns
    download_patterns = [
        r"/download/",
        r"/downloads/",
        r"/attachments/",
        r"/files/",
        r"/uploads/",
        r"/wp-content/uploads/",
        r"/content/uploads/",
        r"/assets/downloads/",
        r"/documents/",
        r"/pdfs/",
        r"\.pdf$",
        r"\.pdf\?",
        r"\.pdf#",
        r"attachment\.aspx",
        r"download\.aspx",
        r"getfile\.aspx",
        r"viewdocument\.aspx",
    ]

    return any(re.search(pattern, url_lower) for pattern in download_patterns)


def _is_corrupted_pdf_content(content: str) -> bool:
    """Detect if content appears to be corrupted PDF text."""
    if not content or len(content.strip()) < 10:
        return False

    # Common PDF corruption indicators
    pdf_corruption_patterns = [
        r"endstream\s+endobj",
        r"obj\s*<[^>]*>\s*stream",
        r"%PDF-\d+\.\d+",
        r"xref\s+\d+",
        r"trailer\s*<<",
        r"startxref",
        r"%%EOF",
        r"stream\s+H\s+[^\w\s]{10,}",  # Stream followed by garbled text
        r"[^\w\s]{20,}",  # Long sequences of non-word/space characters
        r"obj\s+<\s*>\s*stream",
        r"BT\s+/F\d+",  # PDF text object indicators
        r"ET\s+Q\s+q",  # PDF graphics state operators
    ]

    corruption_score = 0
    for pattern in pdf_corruption_patterns:
        if re.search(pattern, content, re.IGNORECASE):
            corruption_score += 1

    # Check character distribution - PDFs often have weird character distributions
    if len(content) > 50:
        # Count non-printable or unusual characters
        unusual_chars = sum(
            1 for c in content if ord(c) > 127 or (ord(c) < 32 and c not in "\t\n\r ")
        )
        unusual_ratio = unusual_chars / len(content)

        if unusual_ratio > 0.3:  # More than 30% unusual characters
            corruption_score += 2

    # Check for excessive special characters in a row
    if re.search(r"[^\w\s]{15,}", content):
        corruption_score += 1

    # Check for PDF-specific garbled patterns
    if re.search(r"[A-Za-z0-9]{2,}\s+[^\w\s]{5,}\s+[A-Za-z0-9]{2,}", content):
        corruption_score += 1

    return corruption_score >= 2
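Both helpers are pure functions and can be sanity-checked in isolation; the URLs and snippet below are made up:

from deploy.utils.url_filter import _is_pdf_or_download_url, _is_corrupted_pdf_content

print(_is_pdf_or_download_url("https://example.org/report.pdf"))        # True
print(_is_pdf_or_download_url("https://example.org/news/article-123"))  # False

garbled = "%PDF-1.7 obj <> stream endstream endobj xref 0 12 trailer << startxref %%EOF"
print(_is_corrupted_pdf_content(garbled))  # True: several corruption indicators match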
models/clickbait/feature_info.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:053e642a1e6692cd3ca116bb36aca8aab7c65f45ac07ccd75babd638debf07e3
size 1126
models/clickbait/logistic_regression_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6c03ee9383f2dd1fe51d335adbc292975fd051202d981cdff0805a16941b6f80
size 40845
models/clickbait/tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e1e7a97703602d9eafb046d14e9dd776fce4dbc050cf7ca201fedbc4dd31b13c
size 186468
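The three pointers above are Git LFS stubs for the trained clickbait artifacts. A hedged loading sketch, assuming the scikit-learn version pinned in requirements.txt; the authoritative prediction logic lives in deploy/main/predict_clickbait.py, which is not reproduced here:

import pickle

paths = {
    "vectorizer": "models/clickbait/tfidf_vectorizer.pkl",
    "model": "models/clickbait/logistic_regression_model.pkl",
    "feature_info": "models/clickbait/feature_info.pkl",
}

artifacts = {}
for name, path in paths.items():
    with open(path, "rb") as f:
        artifacts[name] = pickle.load(f)

# Inspect what was serialized; how the TF-IDF features and the handcrafted
# features are combined at prediction time is defined in predict_clickbait.py.
print({name: type(obj).__name__ for name, obj in artifacts.items()})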
requirements.txt
CHANGED
@@ -1,3 +1,13 @@
+beautifulsoup4==4.13.4
+googlesearch-python==1.3.0
+numpy==2.0.2
+requests==2.32.3
+tldextract==5.3.0
+tqdm==4.67.1
+newspaper3k
+lxml_html_clean
+nltk==3.9.1
 sentence-transformers==4.1.0
 torch==2.7.1
+scikit-learn==1.6.1
 textblob==0.19.0
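The new pins cover scraping (beautifulsoup4, newspaper3k, lxml_html_clean, requests), search (googlesearch-python), and the clickbait model (numpy, scikit-learn). On Spaces they are installed automatically at build time; a local run would typically start with:

pip install -r requirements.txt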
semantic_similarity.py
ADDED
@@ -0,0 +1,92 @@
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
from typing import List
import torch
from sentence_transformers import SentenceTransformer, util
from textblob import TextBlob

model = SentenceTransformer("paraphrase-MiniLM-L12-v2")
model.eval()


def calculate_semantic_similarity(
    claim: str, sentences: List[str], similarity_threshold: float = 0.4
) -> float:
    """
    Calculates a weighted score representing how well a list of sentences supports a claim.

    Args:
        claim (str): The claim to be verified.
        sentences (List[str]): A list of sentences to check against the claim.
        similarity_threshold (float, optional): The minimum similarity score for a
            sentence to be considered "supporting". Defaults to 0.4.

    Returns:
        float: A weighted score between 0.0 and 1.0.
    """
    if not sentences:
        return 0.0

    all_scores = []

    with torch.no_grad():
        claim_embedding = model.encode(claim, show_progress_bar=False)
        sentence_embeddings = model.encode(sentences, show_progress_bar=False)
        cosine_scores = util.cos_sim(claim_embedding, sentence_embeddings)[0]
        claim_sentiment = TextBlob(claim).sentiment.polarity

        for i, sentence in enumerate(sentences):
            similarity = cosine_scores[i].item()
            sentence_sentiment = TextBlob(sentence).sentiment.polarity

            # Boost the score slightly when claim and sentence agree in
            # sentiment polarity, and dampen it when they disagree.
            if claim_sentiment * sentence_sentiment > 0:
                similarity *= 1.1
            elif claim_sentiment * sentence_sentiment < 0:
                similarity *= 0.9

            # print(f"Sentence: {sentence}\nSimilarity: {similarity:.2f}\n")
            similarity = max(0.0, min(1.0, similarity))
            all_scores.append(similarity)

    supporting_scores = [s for s in all_scores if s >= similarity_threshold]
    proportion_supporting = len(supporting_scores) / len(sentences)

    if proportion_supporting >= 0.30:
        final_score = sum(supporting_scores) / len(supporting_scores)
    else:
        average_all_scores = sum(all_scores) / len(all_scores)
        # penalty = 0.80  # 20% reduction (currently disabled)
        final_score = average_all_scores  # * penalty

    return final_score


if __name__ == "__main__":
    while True:
        claim_to_verify = input("Enter claim to verify: ")
        evidence = input("Enter evidence sentences: ")
        # Sample evidence kept for reference; the score below is computed only
        # from the single sentence typed in above.
        evidence_sentences = [
            "The recent legislation is projected to stimulate significant economic growth.",  # High similarity
            "Market analysts are optimistic about the financial future following the announcement.",  # High similarity
            "However, some critics argue that the policy might lead to unforeseen inflation.",  # Low similarity
            "The stock market reacted positively, showing a slight increase.",  # Medium similarity
            "This is considered a poor decision for the nation's financial stability by some experts.",  # Opposing sentiment
            "The primary goal of the initiative is to create jobs and encourage consumer spending.",  # High similarity
            "Unemployment rates are expected to decline in the coming months.",  # High similarity
            "There has been some public disapproval regarding the policy's rollout.",  # Low similarity
            "This will surely lead to a stronger and more resilient economy.",  # High similarity
            "Financial experts have voiced concerns about the potential long-term consequences.",  # Opposing sentiment
        ]

        final_score = calculate_semantic_similarity(claim_to_verify, [evidence.strip()])

        print(f"The final weighted support score for the claim is: {final_score:.4f}")

        if final_score > 0.65:
            print("Interpretation: The claim is strongly supported by the evidence.")
        elif final_score > 0.4:
            print("Interpretation: The claim has moderate support from the evidence.")
        else:
            print("Interpretation: The claim has weak support from the evidence.")
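A small library-style call against the function above; the claim and evidence sentences are invented for illustration:

from semantic_similarity import calculate_semantic_similarity

claim = "The new policy will boost economic growth."
evidence = [
    "The recent legislation is projected to stimulate significant economic growth.",
    "Market analysts are optimistic following the announcement.",
    "Some critics argue the policy might lead to unforeseen inflation.",
]

score = calculate_semantic_similarity(claim, evidence, similarity_threshold=0.4)
print(f"Weighted support score: {score:.4f}")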