Spaces:
Running
Running
| import logging | |
| import re | |
| from typing import Tuple, List, Dict, Optional | |
| import os | |
| import time | |
# Module-level logger. The explicit name "misinformation_detector" (rather than
# __name__) is kept as-is; presumably other modules of the project log under
# the same name -- confirm before changing it.
logger = logging.getLogger("misinformation_detector")
# Claim categories and the keywords associated with each one.
#
# Maps a lowercase category name ("ai", "science", "technology", "politics",
# "business", "world", "sports", "entertainment") to a list of keyword strings.
# Keywords are a mix of single words, multi-word phrases and acronyms, and are
# stored in mixed case; how they are matched against a claim (case folding,
# word boundaries, scoring) is decided by the consuming code, which is not
# part of this table.
#
# NOTE(review): several keywords are repeated within a single list (e.g.
# "transformer", "MoE", "VAE" under "ai"; the whole "Sports media and
# business" group under "sports" appears twice), and some keywords appear in
# more than one category (e.g. "bitcoin", "blockchain"). The repeats are kept
# verbatim here because a consumer that counts matches per list entry would
# change its scores if they were removed -- confirm against the caller before
# de-duplicating.
CLAIM_CATEGORIES: Dict[str, List[str]] = {
    "ai": [
        # General AI terms
        "AI", "artificial intelligence", "machine learning", "ML", "deep learning", "DL",
        "neural network", "neural nets", "generative AI", "GenAI", "AGI", "artificial general intelligence",
        "transformer", "attention mechanism", "fine-tuning", "pre-training", "training", "inference",
        # AI Models and Architectures
        "language model", "large language model", "LLM", "foundation model", "multimodal model",
        "vision language model", "VLM", "text-to-speech", "TTS", "speech-to-text", "STT",
        "text-to-image", "image-to-text", "diffusion model", "generative model", "discriminative model",
        "GPT", "BERT", "T5", "PaLM", "Claude", "Llama", "Gemini", "Mistral", "Mixtral", "Stable Diffusion",
        "Dall-E", "Midjourney", "Sora", "transformer", "MoE", "mixture of experts", "sparse model",
        "dense model", "encoder", "decoder", "encoder-decoder", "autoencoder", "VAE",
        "mixture of experts", "MoE", "sparse MoE", "switch transformer", "gated experts",
        "routing network", "expert routing", "pathways", "multi-query attention", "multi-head attention",
        "rotary position embedding", "RoPE", "grouped-query attention", "GQA", "flash attention",
        "state space model", "SSM", "mamba", "recurrent neural network", "RNN", "LSTM", "GRU",
        "convolutional neural network", "CNN", "residual connection", "skip connection", "normalization",
        "layer norm", "group norm", "batch norm", "parameter efficient fine-tuning", "PEFT",
        "LoRA", "low-rank adaptation", "QLoRA", "adapters", "prompt tuning", "prefix tuning",
        # AI Learning Paradigms
        "supervised learning", "unsupervised learning", "reinforcement learning", "RL",
        "meta-learning", "transfer learning", "federated learning", "self-supervised learning",
        "semi-supervised learning", "few-shot learning", "zero-shot learning", "one-shot learning",
        "contrastive learning", "curriculum learning", "imitation learning", "active learning",
        "reinforcement learning from human feedback", "RLHF", "direct preference optimization", "DPO",
        "constitutional AI", "red teaming", "adversarial training", "GAN", "generative adversarial network",
        "diffusion", "latent diffusion", "flow-based model", "variational autoencoder", "VAE",
        # AI Capabilities and Applications
        "natural language processing", "NLP", "computer vision", "CV", "speech recognition",
        "text generation", "image generation", "video generation", "multimodal", "multi-modal",
        "recommendation system", "recommender system", "chatbot", "conversational AI",
        "sentiment analysis", "entity recognition", "semantic search", "vector search", "embedding",
        "classification", "regression", "clustering", "anomaly detection", "agent", "AI agent",
        "autonomous agent", "agentic", "RAG", "retrieval augmented generation", "tool use",
        "function calling", "reasoning", "chain-of-thought", "CoT", "tree-of-thought", "ToT",
        "planning", "decision making", "multi-agent", "agent swarm", "multi-agent simulation",
        # AI Technical Terms
        "token", "tokenizer", "tokenization", "embedding", "vector", "prompt", "prompt engineering",
        "context window", "parameter", "weights", "bias", "activation function", "loss function",
        "gradient descent", "backpropagation", "epoch", "batch", "mini-batch", "regularization",
        "dropout", "overfitting", "underfitting", "hyperparameter", "latent space", "latent variable",
        "feature extraction", "dimensionality reduction", "quantization", "pruning",
        "fine-tuning", "transfer learning", "knowledge distillation", "int4", "int8", "bfloat16",
        "float16", "mixed precision", "GPTQ", "AWQ", "GGUF", "GGML", "KV cache", "speculative decoding",
        "beam search", "greedy decoding", "temperature", "top-k", "top-p", "nucleus sampling",
        # AI Tools and Frameworks
        "TensorFlow", "PyTorch", "JAX", "Keras", "Hugging Face", "Transformers", "Diffusers",
        "LangChain", "Llama Index", "OpenAI", "Anthropic", "NVIDIA", "GPU", "TPU", "IPU", "NPU", "CUDA",
        "MLOps", "model monitoring", "model deployment", "model serving", "inference endpoint",
        "vLLM", "TGI", "text generation inference", "triton", "onnx", "tensorRT",
        # AI Ethics and Concerns
        "AI ethics", "responsible AI", "AI safety", "AI alignment", "AI governance",
        "bias", "fairness", "interpretability", "explainability", "XAI", "transparency",
        "hallucination", "toxicity", "safe deployment", "AI risk", "AI capabilities",
        "alignment tax", "red teaming", "jailbreak", "prompt injection", "data poisoning",
        # AI Companies and Organizations
        "OpenAI", "Anthropic", "Google DeepMind", "Meta AI", "Microsoft", "NVIDIA",
        "Hugging Face", "Mistral AI", "Cohere", "AI21 Labs", "Stability AI", "Midjourney",
        "EleutherAI", "Allen AI", "DeepMind", "Character AI", "Inflection AI", "xAI"
    ],
    "science": [
        # General scientific terms
        "study", "research", "scientist", "scientific", "discovered", "experiment",
        "laboratory", "clinical", "trial", "hypothesis", "theory", "evidence-based",
        "peer-reviewed", "journal", "publication", "finding", "breakthrough", "innovation",
        "discovery", "analysis", "measurement", "observation", "empirical",
        # Biology and medicine
        "biology", "chemistry", "physics", "genetics", "genomics", "DNA", "RNA",
        "medicine", "gene", "protein", "molecule", "cell", "brain", "neuro",
        "cancer", "disease", "cure", "treatment", "vaccine", "health", "medical",
        "pharmaceutical", "drug", "therapy", "symptom", "diagnosis", "prognosis",
        "patient", "doctor", "hospital", "clinic", "surgery", "immune", "antibody",
        "virus", "bacteria", "pathogen", "infection", "epidemic", "pandemic",
        "organism", "evolution", "mutation", "chromosome", "enzyme", "hormone",
        # Physics and astronomy
        "quantum", "particle", "atom", "nuclear", "electron", "neutron", "proton",
        "atomic", "subatomic", "molecular", "energy", "matter", "mass", "force",
        "space", "NASA", "telescope", "planet", "exoplanet", "moon", "lunar", "mars",
        "star", "galaxy", "cosmic", "astronomical", "universe", "solar", "celestial",
        "orbit", "gravitational", "gravity", "relativity", "quantum mechanics",
        "string theory", "dark matter", "dark energy", "black hole", "supernova",
        "radiation", "radioactive", "isotope", "fission", "fusion", "accelerator",
        # Environmental science
        "climate", "carbon", "environment", "ecosystem", "species", "extinct",
        "endangered", "biodiversity", "conservation", "sustainable", "renewable",
        "fossil fuel", "greenhouse", "global warming", "polar", "ice cap", "glacier",
        "ozone", "atmosphere", "weather", "meteorology", "geology", "earthquake",
        "volcanic", "ocean", "marine", "coral reef", "deforestation", "pollution",
        # Math and computer science (non-AI specific)
        "equation", "formula", "theorem", "calculus", "statistical", "probability",
        "variable", "matrix", "optimization",
        # Organizations
        "CERN", "NIH", "CDC", "WHO", "NOAA", "ESA", "SpaceX", "Blue Origin", "JPL",
        "laboratory", "institute", "university", "academic", "faculty", "professor",
        # Science tools
        "Matlab", "SPSS", "SAS", "ImageJ", "LabVIEW", "ANSYS", "Cadence", "Origin",
        "Avogadro", "ChemDraw", "Mathematica", "Wolfram Alpha", "COMSOL", "LAMMPS",
        "VASP", "Gaussian", "GIS", "ArcGIS", "QGIS", "Maple", "R Studio"
    ],
    "technology": [
        # General tech terms
        "computer", "hardware", "internet", "cyber", "digital", "tech",
        "robot", "automation", "autonomous", "code", "programming", "data", "cloud",
        "server", "network", "encryption", "blockchain", "crypto", "bitcoin", "ethereum",
        "technology", "breakthrough", "prototype", "dataset",
        "engineering", "technical", "specification", "feature", "functionality",
        "interface", "system", "infrastructure", "integration", "implementation",
        # Devices and hardware
        "smartphone", "device", "gadget", "laptop", "desktop", "tablet", "wearable",
        "smartwatch", "IoT", "internet of things", "sensor", "chip", "semiconductor",
        "processor", "CPU", "GPU", "memory", "RAM", "storage", "hard drive", "SSD",
        "electronic", "circuit", "motherboard", "component", "peripheral", "accessory",
        "display", "screen", "touchscreen", "camera", "lens", "microphone", "speaker",
        "battery", "charger", "wireless", "bluetooth", "WiFi", "router", "modem",
        # Software and internet
        "algorithm", "app", "application", "platform", "website", "online", "web", "browser",
        "operating system", "Windows", "macOS", "Linux", "Android", "iOS", "software",
        "program", "code", "coding", "development", "framework", "library", "API",
        "backend", "frontend", "full-stack", "developer", "programmer", "function",
        "database", "SQL", "NoSQL", "cloud computing", "SaaS", "PaaS", "IaaS",
        "DevOps", "agile", "scrum", "sprint", "version control", "git", "repository",
        # Communications and networking
        "5G", "6G", "broadband", "fiber", "network", "wireless", "cellular", "mobile",
        "telecommunications", "telecom", "transmission", "bandwidth", "latency",
        "protocol", "IP address", "DNS", "server", "hosting", "data center",
        # Company and product names
        "Apple", "Google", "Microsoft", "Amazon", "Facebook", "Meta", "Tesla",
        "IBM", "Intel", "AMD", "Nvidia", "Qualcomm", "Cisco", "Oracle", "SAP",
        "Huawei", "Samsung", "Sony", "LG", "Dell", "HP", "Lenovo", "Xiaomi",
        "iPhone", "iPad", "MacBook", "Surface", "Galaxy", "Pixel", "Windows",
        "Android", "iOS", "Chrome", "Firefox", "Edge", "Safari", "Office",
        "Azure", "AWS", "Google Cloud", "Gmail", "Outlook", "Teams", "Zoom",
        # Advanced technologies
        "VR", "AR", "XR", "virtual reality", "augmented reality", "mixed reality",
        "metaverse", "3D printing", "additive manufacturing", "quantum computing",
        "nanotechnology", "biotechnology", "electric vehicle", "self-driving",
        "autonomous vehicle", "drone", "UAV", "robotics", "cybersecurity",
        # Social media
        "social media", "social network", "Facebook", "Instagram", "Twitter", "X",
        "LinkedIn", "TikTok", "Snapchat", "YouTube", "Pinterest", "Reddit",
        "streaming", "content creator", "influencer", "follower", "like", "share",
        "post", "tweet", "user-generated", "viral", "trending", "engagement",
        # Technology tools
        "NumPy", "Pandas", "Matplotlib", "Seaborn", "Scikit-learn", "Jupyter",
        "Visual Studio", "VS Code", "IntelliJ", "PyCharm", "Eclipse", "Android Studio",
        "Xcode", "Docker", "Kubernetes", "Jenkins", "Ansible", "Terraform", "Vagrant",
        "AWS CLI", "Azure CLI", "GCP CLI", "PowerShell", "Bash", "npm", "pip", "conda",
        "React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Spring", "Laravel",
        "PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch", "Kafka", "RabbitMQ",
        # Optimization terms
        "efficiency", "performance tuning", "benchmarking", "profiling",
        "refactoring", "scaling", "bottleneck", "throughput", "latency reduction",
        "response time", "caching", "load balancing", "distributed computing",
        "parallel processing", "concurrency", "asynchronous", "memory management"
    ],
    "politics": [
        # Government structure
        "president", "prime minister", "government", "parliament", "congress",
        "senate", "house", "representative", "minister", "secretary", "cabinet",
        "administration", "mayor", "governor", "politician", "official", "authority",
        "federal", "state", "local", "municipal", "county", "city", "town",
        "constituency", "district", "precinct", "ward", "judiciary", "executive",
        "legislative", "branch", "checks and balances", "separation of powers",
        # Political activities
        "election", "campaign", "vote", "voter", "ballot", "polling",
        "political", "politics", "debate", "speech", "address", "press conference",
        "approval rating", "opinion poll", "candidate", "incumbent", "challenger",
        "primary", "caucus", "convention", "delegate", "nomination", "campaign trail",
        "fundraising", "lobbying", "advocacy", "activism", "protest", "demonstration",
        # Political ideologies
        "democracy", "democratic", "republican", "conservative", "liberal",
        "progressive", "left-wing", "right-wing", "centrist", "moderate",
        "socialist", "capitalist", "communist", "libertarian", "populist",
        "nationalist", "globalist", "isolationist", "hawk", "dove",
        "ideology", "partisan", "bipartisan", "coalition", "majority", "minority",
        # Laws and regulations
        "bill", "law", "legislation", "regulation", "policy", "statute", "code",
        "amendment", "reform", "repeal", "enact", "implement", "enforce",
        "constitutional", "unconstitutional", "legal", "illegal", "legalize",
        "criminalize", "deregulate", "regulatory", "compliance", "mandate",
        # Judicial and legal
        "court", "supreme", "justice", "judge", "ruling", "decision", "opinion",
        "case", "lawsuit", "litigation", "plaintiff", "defendant", "prosecutor",
        "attorney", "lawyer", "advocate", "judicial review", "precedent",
        "constitution", "amendment", "rights", "civil rights", "human rights",
        # International relations
        "treaty", "diplomatic", "diplomacy", "relations",
        "foreign policy", "domestic policy", "UN", "NATO", "EU", "United Nations",
        "sanctions", "embargo", "tariff", "trade war", "diplomat", "embassy",
        "consulate", "ambassador", "delegation", "summit", "bilateral", "multilateral",
        "alliance", "ally", "adversary", "geopolitical", "sovereignty", "regime",
        # Security and defense
        "national security", "homeland security", "defense", "military", "armed forces",
        "army", "navy", "air force", "marines", "coast guard", "intelligence",
        "CIA", "FBI", "NSA", "Pentagon", "war", "conflict", "peacekeeping",
        "terrorism", "counterterrorism", "insurgency", "nuclear weapon", "missile",
        "disarmament", "nonproliferation", "surveillance", "espionage",
        # Political institutions
        "White House", "Kremlin", "Downing Street", "Capitol Hill", "Westminster",
        "United Nations", "European Union", "NATO", "World Bank", "IMF", "WTO",
        "ASEAN", "African Union", "BRICS", "G7", "G20",
        # Political parties and movements
        "Democrat", "Republican", "Labour", "Conservative", "Green Party",
        "Socialist", "Communist", "Libertarian", "Independent", "Tea Party",
        "progressive movement", "civil rights movement", "womens rights",
        "LGBTQ rights", "Black Lives Matter", "environmental movement"
    ],
    "business": [
        # Companies and organization types
        "company", "corporation", "business", "startup", "firm", "enterprise",
        "corporate", "industry", "sector", "conglomerate", "multinational",
        "organization", "entity", "private", "public", "incorporated", "LLC",
        "partnership", "proprietorship", "franchise", "subsidiary", "parent company",
        "headquarters", "office", "facility", "plant", "factory", "warehouse",
        "retail", "wholesale", "ecommerce", "brick-and-mortar", "chain", "outlet",
        # Business roles and management
        "executive", "CEO", "CFO", "CTO", "COO", "CMO", "CIO", "CHRO", "chief",
        "director", "board", "chairman", "chairwoman", "chairperson", "president",
        "vice president", "senior", "junior", "manager", "management", "supervisor",
        "founder", "entrepreneur", "owner", "shareholder", "stakeholder",
        "employee", "staff", "workforce", "personnel", "human resources", "HR",
        "recruit", "hire", "layoff", "downsizing", "restructuring", "reorganization",
        "leadership",
        # Financial terms
        "profit", "revenue", "sales", "income", "earnings", "EBITDA", "turnover",
        "loss", "deficit", "expense", "cost", "overhead", "margin", "markup",
        "budget", "forecast", "projection", "estimate", "actual", "variance",
        "balance sheet", "income statement", "cash flow", "P&L", "liquidity",
        "solvency", "asset", "liability", "equity", "debt", "leverage", "capital",
        "working capital", "cash", "funds", "money", "payment", "transaction",
        # Markets and trading
        "market", "stock", "share", "bond", "security", "commodity", "futures",
        "option", "derivative", "forex", "foreign exchange", "currency", "crypto",
        "trader", "trading", "buy", "sell", "long", "short", "position", "portfolio",
        "diversification", "hedge", "risk", "return", "yield", "dividend", "interest",
        "bull market", "bear market", "correction", "crash", "rally", "volatile",
        "volatility", "index", "benchmark", "Dow Jones", "NASDAQ", "S&P 500", "NYSE",
        # Investment and funding
        "investor", "investment", "fund", "mutual fund", "ETF", "hedge fund",
        "private equity", "venture", "venture capital", "VC", "angel investor",
        "seed", "Series A", "Series B", "Series C", "funding", "financing",
        "loan", "credit", "debt", "equity", "fundraising", "crowdfunding",
        "IPO", "initial public offering", "going public", "listed", "delisted",
        "merger", "acquisition", "M&A", "takeover", "buyout", "divestiture",
        "valuation", "billion", "million", "trillion", "unicorn", "decacorn",
        # Economic terms
        "economy", "economic", "economics", "macro", "micro", "fiscal", "monetary",
        "supply", "demand", "market forces", "competition", "competitive", "monopoly",
        "oligopoly", "antitrust", "deregulation", "growth", "decline",
        "recession", "depression", "recovery", "expansion", "contraction", "cycle",
        "inflation", "deflation", "stagflation", "hyperinflation", "CPI", "price",
        "GDP", "gross domestic product", "GNP", "productivity", "output", "input",
        # Banking and finance
        "finance", "financial", "bank", "banking", "commercial bank", "investment bank",
        "central bank", "Federal Reserve", "Fed", "ECB", "Bank of England", "BOJ",
        "interest rate", "prime rate", "discount rate", "basis point", "monetary policy",
        "quantitative easing", "tightening", "loosening", "credit", "lending",
        "borrowing", "loan", "mortgage", "consumer credit", "credit card", "debit card",
        "checking", "savings", "deposit", "withdrawal", "ATM", "branch", "online banking",
        # Currencies and payments
        "dollar", "euro", "pound", "yen", "yuan", "rupee", "ruble", "real", "peso",
        "currency", "money", "fiat", "exchange rate", "remittance", "transfer",
        "payment", "transaction", "wire", "ACH", "SWIFT", "clearing", "settlement",
        "cryptocurrency", "bitcoin", "ethereum", "blockchain", "fintech", "paytech",
        # Business operations
        "product", "service", "solution", "offering", "launch", "rollout", "release",
        "operation", "production", "manufacturing", "supply chain", "logistics",
        "procurement", "inventory", "distribution", "shipping", "delivery",
        "quality", "control", "assurance", "standard", "certification",
        # Marketing and sales
        "marketing", "advertise", "advertising", "campaign", "promotion", "publicity",
        "PR", "public relations", "brand", "branding", "identity", "image", "reputation",
        "sales", "selling", "deal", "transaction", "pipeline", "lead", "prospect",
        "customer", "client", "consumer", "buyer", "purchaser", "target market",
        "segment", "demographic", "psychographic", "B2B", "B2C", "retail", "wholesale",
        "price", "pricing", "discount", "premium", "luxury", "value", "bargain"
    ],
    "world": [
        # General international terms
        "country", "nation", "state", "republic", "kingdom", "global", "international",
        "foreign", "world", "worldwide", "domestic", "abroad", "overseas",
        "developed", "developing", "industrialized", "emerging", "third world",
        "global south", "global north", "east", "west", "western", "eastern",
        "bilateral", "multilateral", "transnational", "multinational", "sovereignty",
        # Regions and continents
        "Europe", "European", "Asia", "Asian", "Africa", "African", "North America",
        "South America", "Latin America", "Australia", "Oceania", "Antarctica",
        "Middle East", "Central Asia", "Southeast Asia", "East Asia", "South Asia",
        "Eastern Europe", "Western Europe", "Northern Europe", "Southern Europe",
        "Mediterranean", "Scandinavia", "Nordic", "Baltic", "Balkans", "Caucasus",
        "Caribbean", "Central America", "South Pacific", "Polynesia", "Micronesia",
        # Major countries and regions
        "China", "Chinese", "Russia", "Russian", "India", "Indian", "Japan", "Japanese",
        "UK", "British", "England", "English", "Scotland", "Scottish", "Wales", "Welsh",
        "Germany", "German", "France", "French", "Italy", "Italian", "Spain", "Spanish",
        "Canada", "Canadian", "Brazil", "Brazilian", "Mexico", "Mexican", "Turkey", "Turkish",
        "United States", "US", "USA", "American", "Britain", "Korea", "Korean",
        "North Korea", "South Korea", "Saudi", "Saudi Arabia", "Saudi Arabian",
        "Iran", "Iranian", "Iraq", "Iraqi", "Israel", "Israeli", "Palestine", "Palestinian",
        "Egypt", "Egyptian", "Pakistan", "Pakistani", "Indonesia", "Indonesian",
        "Australia", "Australian", "New Zealand", "Nigeria", "Nigerian", "South Africa",
        "Argentina", "Argentinian", "Colombia", "Colombian", "Venezuela", "Venezuelan",
        "Ukraine", "Ukrainian", "Poland", "Polish", "Switzerland", "Swiss",
        "Netherlands", "Dutch", "Belgium", "Belgian", "Sweden", "Swedish", "Norway", "Norwegian",
        # International issues and topics
        "war", "conflict", "crisis", "tension", "dispute", "hostility", "peace",
        "peacekeeping", "ceasefire", "truce", "armistice", "treaty", "agreement",
        "compromise", "negotiation", "mediation", "resolution", "settlement",
        "refugee", "migrant", "asylum seeker", "displacement", "humanitarian",
        "border", "frontier", "territory", "territorial", "sovereignty", "jurisdiction",
        "terror", "terrorism", "extremism", "radicalism", "insurgency", "militant",
        "sanction", "embargo", "restriction", "isolation", "blockade",
        # International trade and economy
        "trade", "import", "export", "tariff", "duty", "quota", "subsidy",
        "protectionism", "free trade", "fair trade", "globalization", "trade war",
        "trade agreement", "trade deal", "trade deficit", "trade surplus",
        "supply chain", "outsourcing", "offshoring", "reshoring", "nearshoring",
        # Diplomacy and international relations
        "embassy", "consulate", "diplomatic", "diplomacy", "diplomat", "ambassador",
        "consul", "attaché", "envoy", "emissary", "delegation", "mission",
        "foreign policy", "international relations", "geopolitics", "geopolitical",
        "influence", "power", "superpower", "hegemony", "alliance", "coalition",
        "bloc", "axis", "sphere of influence", "buffer state", "proxy",
        # International organizations
        "UN", "United Nations", "EU", "European Union", "NATO", "NAFTA", "USMCA",
        "ASEAN", "OPEC", "Commonwealth", "Arab League", "African Union", "AU",
        "BRICS", "G7", "G20", "IMF", "World Bank", "WTO", "WHO", "UNESCO",
        "Security Council", "General Assembly", "International Court of Justice",
        # Travel and cultural exchange
        "visa", "passport", "immigration", "emigration", "migration", "travel",
        "tourism", "tourist", "visitor", "foreigner", "expatriate", "expat",
        "citizenship", "nationality", "dual citizen", "naturalization",
        "cultural", "tradition", "heritage", "indigenous", "native", "local",
        "language", "dialect", "translation", "interpreter", "cross-cultural",
        # Other
        "event"
    ],
    "sports": [
        # General sports terms
        "game", "match", "tournament", "championship", "league", "cup", "Olympics",
        "olympic", "world cup", "competition", "contest",
        "sport", "sporting", "athletics", "physical", "play", "compete", "competition",
        "amateur", "professional", "pro", "preseason", "regular season",
        "postseason", "playoff", "final", "semifinal", "quarterfinal", "qualifying",
        # Team sports
        "football", "soccer", "American football", "rugby", "basketball", "baseball",
        "cricket", "hockey", "ice hockey", "field hockey", "volleyball", "handball",
        "water polo", "lacrosse", "ultimate frisbee", "netball", "kabaddi",
        "team", "club", "franchise", "squad", "roster", "lineup", "formation",
        "player", "coach", "manager", "trainer", "captain", "starter", "substitute",
        "bench", "draft", "trade", "free agent", "contract", "transfer", "loan",
        # Individual sports
        "tennis", "golf", "boxing", "wrestling", "martial arts", "MMA", "UFC",
        "athletics", "track and field", "swimming", "diving", "gymnastics",
        "skiing", "snowboarding", "skating", "figure skating", "speed skating",
        "cycling", "mountain biking", "BMX", "motorsport", "F1", "Formula 1",
        "NASCAR", "IndyCar", "MotoGP", "rally", "marathon", "triathlon", "decathlon",
        "archery", "shooting", "fencing", "equestrian", "rowing", "canoeing", "kayaking",
        "surfing", "skateboarding", "climbing", "bouldering", "weightlifting",
        # Scoring and results
        "score", "point", "goal", "touchdown", "basket", "run", "wicket", "try",
        "win", "lose", "draw", "tie", "defeat", "victory", "champion", "winner",
        "loser", "runner-up", "finalist", "semifinalist", "eliminated", "advance",
        "qualify", "record", "personal best", "world record", "Olympic record",
        "streak", "undefeated", "unbeaten", "perfect season", "comeback",
        # Performance and training
        "fitness", "training", "practice", "drill", "workout", "exercise", "regime",
        "conditioning", "strength", "endurance", "speed", "agility", "flexibility",
        "skill", "technique", "form", "style", "strategy", "tactic", "playbook",
        "offense", "defense", "attack", "counter", "press", "formation",
        "injury", "rehabilitation", "recovery", "physiotherapy", "sports medicine",
        # Sports infrastructure
        "stadium", "arena", "court", "field", "pitch", "rink", "pool", "track",
        "course", "gymnasium", "gym", "complex", "venue", "facility", "locker room",
        "dugout", "bench", "sideline", "grandstand", "spectator", "fan", "supporter",
        # Sports organizations and competitions
        "medal", "gold", "silver", "bronze", "podium", "Olympics", "Paralympic",
        "commonwealth games", "Asian games", "Pan American games", "world championship",
        "grand slam", "masters", "open", "invitational", "classic", "tour", "circuit",
        "IPL", "Indian Premier League", "MLB", "Major League Baseball",
        "NBA", "National Basketball Association", "NFL", "National Football League",
        "NHL", "National Hockey League", "FIFA", "UEFA", "ATP", "WTA", "ICC",
        "Premier League", "La Liga", "Bundesliga", "Serie A", "Ligue 1", "MLS",
        "Champions League", "Europa League", "Super Bowl", "World Series", "Stanley Cup",
        "NCAA", "collegiate", "college", "university", "varsity", "intramural",
        # Sports media and business
        "broadcast", "coverage", "commentator", "announcer", "pundit", "analyst",
        "highlight", "replay", "sports network", "ESPN", "Sky Sports", "Fox Sports",
        "sponsorship", "endorsement", "advertisement", "merchandise", "jersey", "kit",
        "ticket", "season ticket", "box seat", "premium", "concession", "vendor",
        # Sports media and business (continued)
        # NOTE(review): this group repeats the previous one verbatim; kept as-is.
        "broadcast", "coverage", "commentator", "announcer", "pundit", "analyst",
        "highlight", "replay", "sports network", "ESPN", "Sky Sports", "Fox Sports",
        "sponsorship", "endorsement", "advertisement", "merchandise", "jersey", "kit",
        "ticket", "season ticket", "box seat", "premium", "concession", "vendor"
    ],
    "entertainment": [
        # Film and cinema
        "movie", "film", "cinema", "feature", "short film", "documentary", "animation",
        "blockbuster", "indie", "independent film", "foreign film", "box office",
        "screening", "premiere", "release", "theatrical", "stream", "streaming",
        "director", "producer", "screenwriter", "script", "screenplay", "adaptation",
        "cinematography", "cinematographer", "editing", "editor", "visual effects",
        "special effects", "CGI", "motion capture", "sound design", "soundtrack",
        "score", "composer", "scene", "shot", "take", "cut", "sequel", "prequel",
        "trilogy", "franchise", "universe", "reboot", "remake", "spin-off",
        "genre", "action", "comedy", "drama", "thriller", "horror", "sci-fi",
        "science fiction", "fantasy", "romance", "romantic comedy", "rom-com",
        "mystery", "crime", "western", "historical", "biographical", "biopic",
        # Television
        "TV", "television", "show", "episode",
        "finale", "midseason", "sitcom", "drama series", "miniseries", "limited series",
        "anthology", "reality TV", "game show", "talk show", "variety show",
        "network", "cable", "premium cable", "broadcast", "channel", "program",
        "primetime", "daytime", "syndication", "rerun", "renewed", "cancelled",
        "showrunner", "creator", "writer", "TV writer", "episode writer", "staff writer",
        # Performing arts
        "actor", "actress", "performer", "cast", "casting", "star", "co-star",
        "supporting", "lead", "protagonist", "antagonist", "villain", "hero", "anti-hero",
        "character", "role", "portrayal", "acting", "dialogue",
        "monologue", "line", "script", "improv", "improvisation", "stand-up",
        "comedian", "comic", "sketch", "theater", "theatre", "stage", "Broadway",
        "West End", "play", "musical", "opera", "ballet", "dance", "choreography",
        "production", "rehearsal", "audition", "understudy", "troupe", "ensemble",
        # Music
        "music", "song", "track", "single", "album", "EP", "LP", "record",
        "release", "drop", "artist", "musician", "singer", "vocalist", "band",
        "group", "duo", "trio", "soloist", "frontman", "frontwoman", "lead singer",
        "songwriter", "composer", "producer", "DJ", "rapper", "MC", "beatmaker",
        "guitarist", "bassist", "drummer", "pianist", "keyboardist", "violinist",
        "instrumentalist", "orchestra", "symphony", "philharmonic", "conductor",
        "genre", "rock", "pop", "hip-hop", "rap", "R&B", "soul", "funk", "jazz",
        "blues", "country", "folk", "electronic", "EDM", "dance", "techno", "house",
        "metal", "punk", "alternative", "indie", "classical", "reggae", "latin",
        "hit", "chart", "Billboard", "Grammy", "award-winning", "platinum", "gold",
        "concert", "tour", "gig", "show", "venue", "arena",
        "stadium", "festival", "Coachella", "Glastonbury", "Lollapalooza", "Bonnaroo",
        # Celebrity culture
        "celebrity", "star", "fame", "famous", "A-list", "B-list", "icon", "iconic",
        "superstar", "public figure", "household name", "stardom", "limelight",
        "popular", "popularity", "fan", "fanbase", "followers", "stan", "groupie",
        "paparazzi", "tabloid", "gossip", "rumor", "scandal", "controversy",
        "interview", "press conference", "red carpet", "premiere", "gala", "award show",
        # Awards and recognition
        "award", "nominee", "nomination", "winner", "recipient", "honor", "accolade",
        "Oscar", "Academy Award", "Emmy", "Grammy", "Tony", "Golden Globe", "BAFTA",
        "MTV Award", "People's Choice", "Critics' Choice", "SAG Award", "Billboard Award",
        "best actor", "best actress", "best director", "best picture", "best film",
        "best album", "best song", "hall of fame", "lifetime achievement", "legacy",
        # Media and publishing
        "book", "novel", "fiction", "non-fiction", "memoir", "biography", "autobiography",
        "bestseller", "bestselling", "author", "writer", "novelist", "literary",
        "literature", "publisher", "publishing", "imprint", "edition", "volume",
        "chapter", "page", "paragraph", "prose", "narrative", "plot", "storyline",
        "character", "protagonist", "antagonist", "setting", "theme", "genre",
        "mystery", "thriller", "romance", "sci-fi", "fantasy", "young adult", "YA",
        "comic", "comic book", "graphic novel", "manga", "anime", "cartoon",
        # Digital entertainment
        "streaming", "stream", "subscription", "platform", "service", "content",
        "Netflix", "Disney+", "Amazon Prime", "Hulu", "HBO", "HBO Max", "Apple TV+",
        "Peacock", "Paramount+", "YouTube", "YouTube Premium", "TikTok", "Instagram",
        "influencer", "content creator", "vlogger", "blogger", "podcaster", "podcast",
        "episode", "download", "subscriber", "follower", "like", "share", "viral",
        "trending", "binge-watch", "marathon", "spoiler", "recap", "review", "trailer",
        "teaser", "behind the scenes", "BTS", "exclusive", "original"
    ]
}
# Domain-specific RSS feeds, keyed by claim category name.
# Each value is an ordered list of feed URLs. get_category_specific_rss_feeds()
# returns entries from the front of the list, so list order decides which feeds
# are used when only a subset is requested.
CATEGORY_SPECIFIC_FEEDS = {
    # Artificial intelligence / machine learning sources
    "ai": [
        "https://www.artificialintelligence-news.com/feed/",
        "https://www.deeplearningweekly.com/feed",
        "https://openai.com/news/rss.xml",
        "https://aiweekly.co/issues.rss",
        "https://news.mit.edu/topic/mitartificial-intelligence2-rss.xml",
        "https://ai.stanford.edu/blog/feed.xml",
        "https://feeds.feedburner.com/blogspot/gJZg",
        "https://blog.google/technology/ai/rss/",
        "https://deepmind.google/blog/rss.xml",
        "https://blog.tensorflow.org/feeds/posts/default",
        "https://aws.amazon.com/blogs/machine-learning/feed/",
        "https://machinelearning.apple.com/rss.xml",
        # NOTE(review): the msrc.microsoft.com host looks like a security-response
        # blog rather than an AI feed - confirm this entry belongs here.
        "https://msrc.microsoft.com/blog/feed",
        "https://learn.microsoft.com/en-us/archive/blogs/machinelearning/feed.xml",
        "https://rss.arxiv.org/rss/cs.LG"
    ],
    # General science news
    "science": [
        "https://www.science.org/rss/news_current.xml",
        "https://www.nature.com/nature.rss",
        "http://rss.sciam.com/basic-science",
        "http://rss.sciam.com/ScientificAmerican-Global",
        "https://www.newscientist.com/feed/home/?cmpid=RSS|NSNS-Home",
        "https://phys.org/rss-feed/"
    ],
    # Technology news; also the fallback source for "ai" claims
    # (see get_fallback_category)
    "technology": [
        "https://www.wired.com/feed/category/business/latest/rss",
        "https://techcrunch.com/feed/",
        "https://www.technologyreview.com/feed/",
        "https://arstechnica.com/feed/",
        "https://www.theverge.com/rss/index.xml",
        "https://news.ycombinator.com/rss"
    ],
    # Political news
    "politics": [
        "https://feeds.washingtonpost.com/rss/politics",
        "https://rss.nytimes.com/services/xml/rss/nyt/Politics.xml",
        "https://feeds.bbci.co.uk/news/politics/rss.xml",
        "https://www.politico.com/rss/politicopicks.xml",
        "https://www.realclearpolitics.com/index.xml"
    ],
    # Business and markets news
    "business": [
        "https://www.ft.com/rss/home",
        "https://feeds.bloomberg.com/markets/news.rss",
        "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "https://feeds.washingtonpost.com/rss/business",
        "https://www.entrepreneur.com/latest.rss",
        "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10001147",
        "https://feeds.content.dowjones.io/public/rss/WSJcomUSBusiness",
        "https://feeds.a.dj.com/rss/RSSMarketsMain.xml"
    ],
    # International news
    "world": [
        "https://feeds.bbci.co.uk/news/world/rss.xml",
        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "https://www.aljazeera.com/xml/rss/all.xml",
        "https://feeds.washingtonpost.com/rss/world",
        "http://rss.cnn.com/rss/cnn_world.rss"
    ],
    # Sports news
    "sports": [
        "https://www.espn.com/espn/rss/news",
        "https://www.cbssports.com/rss/headlines/",
        "https://www.espncricinfo.com/rss/content/story/feeds/0.xml",
        "https://api.foxsports.com/v1/rss",
        "https://www.sportingnews.com/us/rss",
        "https://www.theguardian.com/sport/rss",
    ],
    # Film, TV, music and arts news
    "entertainment": [
        "https://www.hollywoodreporter.com/feed/",
        "https://variety.com/feed/",
        "https://www.eonline.com/syndication/feeds/rssfeeds/topstories.xml",
        "https://www.rollingstone.com/feed/",
        "https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml"
    ],
    # Dedicated fact-checking sites
    "fact_checking": [
        "https://www.snopes.com/feed/",
        "https://www.politifact.com/rss/all/",
        "https://www.factcheck.org/feed/",
        "https://leadstories.com/atom.xml",
        "https://fullfact.org/feed/all/",
        "https://www.truthorfiction.com/feed/"
    ]
}
def _is_word_char(char: str) -> bool:
    """Return True if *char* can be part of a word (letter, digit or underscore)."""
    return char.isalnum() or char == "_"


def _contains_keyword(text: str, keyword: str) -> bool:
    """Return True if *keyword* occurs in *text* as a whole word or phrase.

    A plain substring test makes short keywords fire inside unrelated words
    ("ai" in "said", "ml" in "html"). Here an occurrence only counts when it
    is not glued to neighbouring word characters. A trailing plural suffix
    ("s" or "es") is tolerated so that "neural networks" still matches the
    keyword "neural network".

    Args:
        text (str): Lowercased text to search in
        keyword (str): Lowercased keyword or phrase to look for
    Returns:
        bool: True if a whole-word occurrence exists
    """
    if not keyword:
        return False
    # Only demand a boundary on a side where the keyword itself ends in a word
    # character; keywords such as "disney+" have no word edge on the right.
    guard_left = _is_word_char(keyword[0])
    guard_right = _is_word_char(keyword[-1])
    text_len = len(text)
    start = text.find(keyword)
    while start != -1:
        end = start + len(keyword)
        left_ok = (
            not guard_left
            or start == 0
            or not _is_word_char(text[start - 1])
        )
        if left_ok:
            if not guard_right:
                return True
            for suffix in ("", "s", "es"):
                stop = end + len(suffix)
                if text.startswith(suffix, end) and (
                    stop >= text_len or not _is_word_char(text[stop])
                ):
                    return True
        start = text.find(keyword, start + 1)
    return False


def detect_claim_category(
    claim: str,
    categories: Optional[Dict[str, List[str]]] = None,
) -> Tuple[str, float]:
    """
    Detect the most likely category of a claim and its confidence score

    The claim is matched case-insensitively against each category's keywords.
    Keywords must appear as whole words or phrases (an optional plural "s"/"es"
    is accepted), and a keyword listed more than once in a category is counted
    only once. Each category with at least one hit scores 0.3 + 0.1 per matched
    keyword, capped at 0.9; the highest-scoring category wins, with ties going
    to the category that appears first in the mapping.

    Args:
        claim (str): The claim text
        categories (dict, optional): Mapping of category name to keyword list.
            Defaults to the module-level CLAIM_CATEGORIES.
    Returns:
        tuple: (category_name, confidence_score); ("general", 0.3) when the
        claim is empty or no keyword matches
    """
    if not claim:
        return "general", 0.3
    if categories is None:
        categories = CLAIM_CATEGORIES
    # Lowercase once so every keyword comparison is case-insensitive
    claim_lower = claim.lower()
    category_scores = {}
    for category, keywords in categories.items():
        # De-duplicate so a keyword repeated in the table cannot inflate the score
        unique_keywords = {keyword.lower() for keyword in keywords}
        matches = sum(
            1 for keyword in unique_keywords
            if _contains_keyword(claim_lower, keyword)
        )
        if matches > 0:
            # Base 0.3 + 0.1 per match, max 0.9
            category_scores[category] = min(0.9, 0.3 + (matches * 0.1))
    if not category_scores:
        return "general", 0.3
    # Every recorded score is at least 0.4, so no minimum-confidence check is needed
    top_category = max(category_scores, key=category_scores.get)
    return top_category, category_scores[top_category]
def get_category_specific_rss_feeds(category: str, max_feeds: int = 5) -> List[str]:
    """
    Get a list of RSS feeds specific to a category

    Returns the first `max_feeds` URLs registered for the category in
    CATEGORY_SPECIFIC_FEEDS, in their configured order. The result is always a
    new list, so callers may modify it without touching the configuration.

    Args:
        category (str): The claim category
        max_feeds (int): Maximum number of feeds to return; zero or a negative
            value yields an empty list
    Returns:
        list: List of RSS feed URLs (empty if the category has no feeds)
    """
    category_feeds = CATEGORY_SPECIFIC_FEEDS.get(category, [])
    # Clamp at zero: a negative bound would otherwise be read by the slice as
    # "everything except the last N feeds" rather than "no feeds".
    return category_feeds[:max(0, max_feeds)]
def get_fallback_category(category: str) -> Optional[str]:
    """
    Look up the alternative category to try when a category yields too little evidence

    Only "ai" has a fallback ("technology"). Every other category returns
    None; per the original note, those fall back to the default RSS feeds
    handled in retrieve_combined_evidence.

    Args:
        category (str): The primary category to find a fallback for
    Returns:
        str or None: Fallback category name or None if no fallback exists
    """
    # AI claims are retried against technology sources
    fallback_by_category = dict(ai="technology")
    try:
        return fallback_by_category[category]
    except KeyError:
        return None