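"""IMDb news extraction endpoint.

Fetches the latest MOVIE and TV news from the IMDb GraphQL API, strips HTML from
each article body, checks Supabase for already-stored news IDs, bulk-inserts the
new items, and returns the combined list sorted by date (newest first).
"""
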
import os
import re
import httpx
from typing import List, Dict
from bs4 import BeautifulSoup
from fastapi import APIRouter, HTTPException

router = APIRouter()

# 🎯 IMDb GraphQL
GRAPHQL_URL = "https://api.graphql.imdb.com"
HEADERS = {"Content-Type": "application/json"}

QUERY = """
query GetNews($first: Int!) {
  movieNews: news(first: $first, category: MOVIE) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
  tvNews: news(first: $first, category: TV) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
}
"""

# 🔧 Supabase Config
SUPABASE_URL = "https://iiwbixdrrhejkthxygak.supabase.co"
SUPABASE_KEY = os.getenv("SUPA_KEY")
SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")

if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")

SUPABASE_HEADERS = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json"
}

SUPABASE_ROLE_HEADERS = {
    "apikey": SUPABASE_ROLE_KEY,
    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
    "Content-Type": "application/json"
}
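
# Note: the anon-key headers are used for the read-only lookup of existing IDs,
# while the service-role headers are used for the bulk insert (the service role
# key bypasses Supabase row-level security, so keep it server-side only).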

# 🧼 HTML Cleanup
def clean_html(raw_html: str) -> str:
    text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)
    text = re.sub(r"\[\s+", "[", text)
    text = re.sub(r"\s+\]", "]", text)
    text = re.sub(r"\{\s+", "{", text)
    text = re.sub(r"\s+\}", "}", text)
    return text.strip()
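
# Illustrative example of the cleanup above:
#   clean_html("<p>Hello ,  world ( test )</p>")  ->  "Hello, world (test)"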

# 🚀 Main endpoint
@router.get("/news")
async def get_news(first: int = 20) -> List[Dict]:
    payload = {
        "query": QUERY,
        "variables": {"first": first}
    }

    async with httpx.AsyncClient(timeout=10.0) as client:
        # Fetch the news from the IMDb GraphQL API
        response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)

        if response.status_code != 200:
            raise HTTPException(status_code=502, detail="Error reaching the IMDb API")

        data = response.json().get("data")
        if not data:
            raise HTTPException(status_code=500, detail="Invalid response from the IMDb API")

        combined = []

        for category_key in ["movieNews", "tvNews"]:
            for edge in data.get(category_key, {}).get("edges", []):
                node = edge.get("node", {})
                image_data = node.get("image")
                combined.append({
                    "news_id": node.get("id"),
                    "title": node.get("articleTitle", {}).get("plainText"),
                    "url": node.get("externalUrl"),
                    "date": node.get("date"),
                    "text": clean_html(node.get("text", {}).get("plaidHtml")),
                    "image": image_data.get("url") if image_data else None,
                    "category": category_key.replace("News", "").upper()
                })

        # 📌 Check which IDs already exist in Supabase
        all_ids = [item["news_id"] for item in combined]

        existing_ids = []
        ids_chunks = [all_ids[i:i + 1000] for i in range(0, len(all_ids), 1000)]  # avoid overly long URLs

        for chunk in ids_chunks:
            query_ids = ",".join([f"\"{nid}\"" for nid in chunk])
            url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
            r = await client.get(url, headers=SUPABASE_HEADERS)
            if r.status_code == 200:
                existing_ids.extend([item["news_id"] for item in r.json()])

        # 🔎 Keep only the news items that are not stored yet
        new_entries = [item for item in combined if item["news_id"] not in existing_ids]

        # 🧾 Insert the new items (in bulk)
        if new_entries:
            insert_url = f"{SUPABASE_URL}/rest/v1/news_extraction"
            await client.post(insert_url, headers=SUPABASE_ROLE_HEADERS, json=new_entries)

        # 🔃 Sort by date (newest first); None dates fall back to "" so the sort never fails
        combined.sort(key=lambda x: x.get("date") or "", reverse=True)
        return combined
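
# Usage sketch (assumption: this module is importable as `routers.news`; adjust
# the import path to match your project layout):
#
#   from fastapi import FastAPI
#   from routers.news import router as news_router
#
#   app = FastAPI()
#   app.include_router(news_router)
#
# The endpoint is then available at GET /news?first=20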