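"""IMDb news extraction endpoint.

Fetches the latest MOVIE and TV news from the IMDb GraphQL API, strips HTML from
each article body, checks Supabase for already-stored news IDs, bulk-inserts the
new items, and returns the combined list sorted by date (newest first).
"""
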
import os
import re
import httpx
from typing import List, Dict
from bs4 import BeautifulSoup
from fastapi import APIRouter, HTTPException

router = APIRouter()

# 🎯 IMDb GraphQL
GRAPHQL_URL = "https://api.graphql.imdb.com"
HEADERS = {"Content-Type": "application/json"}

QUERY = """
query GetNews($first: Int!) {
  movieNews: news(first: $first, category: MOVIE) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
  tvNews: news(first: $first, category: TV) {
    edges {
      node {
        id
        articleTitle { plainText }
        externalUrl
        date
        text { plaidHtml }
        image { url }
      }
    }
  }
}
"""

# 🔧 Supabase Config
SUPABASE_URL = "https://iiwbixdrrhejkthxygak.supabase.co"
SUPABASE_KEY = os.getenv("SUPA_KEY")
SUPABASE_ROLE_KEY = os.getenv("SUPA_SERVICE_KEY")

if not SUPABASE_KEY or not SUPABASE_ROLE_KEY:
    raise ValueError("❌ SUPA_KEY or SUPA_SERVICE_KEY not set in environment!")

SUPABASE_HEADERS = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json"
}

SUPABASE_ROLE_HEADERS = {
    "apikey": SUPABASE_ROLE_KEY,
    "Authorization": f"Bearer {SUPABASE_ROLE_KEY}",
    "Content-Type": "application/json"
}
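
# Note: the anon-key headers are used for the read-only lookup of existing IDs,
# while the service-role headers are used for the bulk insert (the service role
# key bypasses Supabase row-level security, so keep it server-side only).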

# 🧼 HTML Cleanup
def clean_html(raw_html: str) -> str:
    text = BeautifulSoup(raw_html or "", "html.parser").get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)
    text = re.sub(r"\[\s+", "[", text)
    text = re.sub(r"\s+\]", "]", text)
    text = re.sub(r"\{\s+", "{", text)
    text = re.sub(r"\s+\}", "}", text)
    return text.strip()
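
# Illustrative example of the cleanup above:
#   clean_html("<p>Hello ,  world ( test )</p>")  ->  "Hello, world (test)"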

# 🚀 Main endpoint
@router.get("/news")
async def get_news(first: int = 20) -> List[Dict]:
    payload = {
        "query": QUERY,
        "variables": {"first": first}
    }

    async with httpx.AsyncClient(timeout=10.0) as client:
        # Fetch the news from the IMDb GraphQL API
        response = await client.post(GRAPHQL_URL, headers=HEADERS, json=payload)

        if response.status_code != 200:
            raise HTTPException(status_code=502, detail="Error reaching the IMDb API")

        data = response.json().get("data")
        if not data:
            raise HTTPException(status_code=500, detail="Invalid response from the IMDb API")

        combined = []

        for category_key in ["movieNews", "tvNews"]:
            for edge in data.get(category_key, {}).get("edges", []):
                node = edge.get("node", {})
                image_data = node.get("image")
                combined.append({
                    "news_id": node.get("id"),
                    "title": node.get("articleTitle", {}).get("plainText"),
                    "url": node.get("externalUrl"),
                    "date": node.get("date"),
                    "text": clean_html(node.get("text", {}).get("plaidHtml")),
                    "image": image_data.get("url") if image_data else None,
                    "category": category_key.replace("News", "").upper()
                })

        # 📌 Check which IDs already exist in Supabase
        all_ids = [item["news_id"] for item in combined]

        existing_ids = []
        ids_chunks = [all_ids[i:i + 1000] for i in range(0, len(all_ids), 1000)]  # avoid overly long URLs

        for chunk in ids_chunks:
            query_ids = ",".join([f"\"{nid}\"" for nid in chunk])
            url = f"{SUPABASE_URL}/rest/v1/news_extraction?select=news_id&news_id=in.({query_ids})"
            r = await client.get(url, headers=SUPABASE_HEADERS)
            if r.status_code == 200:
                existing_ids.extend([item["news_id"] for item in r.json()])

        # 🔎 Keep only the news items that are not stored yet
        new_entries = [item for item in combined if item["news_id"] not in existing_ids]

        # 🧾 Insert the new items (in bulk)
        if new_entries:
            insert_url = f"{SUPABASE_URL}/rest/v1/news_extraction"
            await client.post(insert_url, headers=SUPABASE_ROLE_HEADERS, json=new_entries)

        # 🔃 Sort by date (newest first); None dates fall back to "" so the sort never fails
        combined.sort(key=lambda x: x.get("date") or "", reverse=True)
        return combined
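
# Usage sketch (assumption: this module is importable as `routers.news`; adjust
# the import path to match your project layout):
#
#   from fastapi import FastAPI
#   from routers.news import router as news_router
#
#   app = FastAPI()
#   app.include_router(news_router)
#
# The endpoint is then available at GET /news?first=20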