habulaj committed (verified)
Commit 4aae173 · 1 Parent(s): cce5dc2

Update routers/search.py

Files changed (1):
  1. routers/search.py (+90 −230)
routers/search.py CHANGED
@@ -4,7 +4,7 @@ import httpx
 import os
 import json
 import re
-from urllib.parse import unquote, urlencode
+from urllib.parse import unquote
 from PIL import Image
 import io
 import asyncio
@@ -15,11 +15,6 @@ from functools import lru_cache
 import aiofiles
 from concurrent.futures import ThreadPoolExecutor
 import time
-import logging
-import random
-
-# Disable httpx logs
-logging.getLogger("httpx").setLevel(logging.WARNING)
 
 router = APIRouter()
 
@@ -33,15 +28,6 @@ thumbnail_executor = ThreadPoolExecutor(
 _url_cache = {}
 _cache_max_size = 1000
 
-# Rotating, more realistic user agents
-USER_AGENTS = [
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15"
-]
-
 @router.get("/search")
 async def search(
     q: str = Query(..., description="Search term for images"),
@@ -49,115 +35,52 @@ async def search(
     include_thumbnails: bool = Query(True, description="Include base64 thumbnails in the responses")
 ):
     """
-    Searches Google Images with maximum performance and improved block evasion
+    Searches Google Images with maximum performance
     """
     start_time = time.time()
 
-    # Multiple search strategies
-    search_strategies = [
-        {
-            "url": "https://www.google.com/search",
-            "params": {
-                "tbm": "isch",
-                "q": q,
-                "hl": "pt-BR",
-                "gl": "br",
-                "safe": "off",
-                "tbs": "isz:l"  # Large images
-            }
-        },
-        {
-            "url": "https://www.google.com/search",
-            "params": {
-                "tbm": "isch",
-                "q": q,
-                "hl": "en",
-                "gl": "us",
-                "safe": "off",
-                "tbs": "isz:lt,islt:4mp"  # Images > 4 MP
-            }
-        }
-    ]
-
-    # More realistic headers, with rotation
-    def get_random_headers():
-        return {
-            "User-Agent": random.choice(USER_AGENTS),
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.8",
-            "Accept-Encoding": "gzip, deflate, br",
-            "Connection": "keep-alive",
-            "Upgrade-Insecure-Requests": "1",
-            "Sec-Fetch-Dest": "document",
-            "Sec-Fetch-Mode": "navigate",
-            "Sec-Fetch-Site": "none",
-            "Sec-Fetch-User": "?1",
-            "Cache-Control": "max-age=0",
-            "DNT": "1"
-        }
-
-    all_images = []
+    # Google Images URL with parameters for large images
+    google_images_url = "http://www.google.com/search"
+
+    params = {
+        "tbm": "isch",
+        "q": q,
+        "start": 0,
+        "sa": "N",
+        "asearch": "arc",
+        "cs": "1",
+        "tbs": "isz:l",
+        "async": f"arc_id:srp_GgSMaOPQOtL_5OUPvbSTOQ_110,ffilt:all,ve_name:MoreResultsContainer,inf:1,_id:arc-srp_GgSMaOPQOtL_5OUPvbSTOQ_110,_pms:s,_fmt:pc"
+    }
+
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+        "Referer": "https://www.google.com/"
+    }
 
     try:
-        # Try multiple strategies
-        for i, strategy in enumerate(search_strategies):
-            try:
-                # HTTP client with anti-detection settings
-                async with httpx.AsyncClient(
-                    timeout=30.0,
-                    follow_redirects=True,  # Follow redirects
-                    limits=httpx.Limits(max_redirects=3)
-                ) as client:
-
-                    # Unique headers for each attempt
-                    headers = get_random_headers()
-
-                    # Add a random delay between attempts
-                    if i > 0:
-                        await asyncio.sleep(random.uniform(1, 3))
-
-                    response = await client.get(
-                        strategy["url"],
-                        params=strategy["params"],
-                        headers=headers
-                    )
-
-                    # Check whether we were redirected to a captcha or a block page
-                    if response.status_code == 200 and "captcha" not in response.text.lower():
-                        images = extract_images_from_response_optimized(response.text)
-                        if images:
-                            all_images.extend(images)
-                            print(f"Strategy {i+1}: {len(images)} images found")
-                            break  # It worked, stop here
-                    else:
-                        print(f"Strategy {i+1} failed: status {response.status_code}")
-                        continue
-
-            except Exception as e:
-                print(f"Error in strategy {i+1}: {str(e)}")
-                continue
-
-        # If no strategy worked, try the alternative API
-        if not all_images:
-            print("Trying alternative method...")
-            all_images = await search_alternative_method(q)
-
-        if not all_images:
-            raise HTTPException(
-                status_code=503,
-                detail="Could not access Google Images. Try again in a few minutes."
-            )
-
-        # Remove duplicates
-        unique_images = []
-        seen_urls = set()
-        for img in all_images:
-            if img.get('url') not in seen_urls:
-                seen_urls.add(img.get('url'))
-                unique_images.append(img)
+        # Google search (fast)
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            response = await client.get(google_images_url, params=params, headers=headers)
+
+        if response.status_code != 200:
+            raise HTTPException(status_code=response.status_code, detail="Error searching Google Images")
+
+        print(f"Google responded in {time.time() - start_time:.2f}s")
+        extract_start = time.time()
+
+        # Optimized extraction
+        images = extract_images_from_response_optimized(response.text)
+        print(f"Extraction finished in {time.time() - extract_start:.2f}s - {len(images)} URLs")
 
         # Massive parallel processing
-        enriched_images = await enrich_images_ultra_fast(unique_images[:100], include_thumbnails)
+        processing_start = time.time()
+        enriched_images = await enrich_images_ultra_fast(images, include_thumbnails)
+        print(f"Processing finished in {time.time() - processing_start:.2f}s")
 
         # Fast filtering
         valid_images = [
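
For orientation, a minimal sketch of how this router is typically mounted and called; the app module below is an assumption, since this diff only touches routers/search.py:

```python
# Hypothetical wiring (module name and port are assumptions, not from this commit):
from fastapi import FastAPI
from routers.search import router

app = FastAPI()
app.include_router(router)

# e.g. with `uvicorn main:app`:
#   GET http://localhost:8000/search?q=sunset&include_thumbnails=false
```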
@@ -165,11 +88,32 @@ async def search(
             if img.get('width', 0) >= min_width and img.get('height', 0) > 0
         ]
 
+        # If there are few results, run an additional, stricter search
+        if len(valid_images) < 20:
+            params["tbs"] = "isz:lt,islt:4mp"
+
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response2 = await client.get(google_images_url, params=params, headers=headers)
+
+            if response2.status_code == 200:
+                additional_images = extract_images_from_response_optimized(response2.text)
+                additional_enriched = await enrich_images_ultra_fast(additional_images, include_thumbnails)
+
+                # Fast merge, with a set for deduplication
+                seen_urls = {img.get('url') for img in valid_images}
+                for img in additional_enriched:
+                    if (img.get('url') not in seen_urls
+                        and img.get('width', 0) >= min_width
+                        and img.get('height', 0) > 0):
+                        valid_images.append(img)
+                        seen_urls.add(img.get('url'))
+
         # Sorting and limiting
         valid_images.sort(key=lambda x: x.get('width', 0), reverse=True)
         final_images = valid_images[:50]
 
         total_time = time.time() - start_time
+        print(f"TOTAL TIME: {total_time:.2f}s - {len(final_images)} final images")
 
         return JSONResponse(content={
             "query": q,
@@ -180,51 +124,12 @@ async def search(
             "images": final_images
         })
 
-    except HTTPException:
-        raise
+    except httpx.TimeoutException:
+        raise HTTPException(status_code=408, detail="Request to Google timed out")
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error running the search: {str(e)}")
 
 
-async def search_alternative_method(query: str) -> List[Dict]:
-    """
-    Alternative method using different endpoints
-    """
-    try:
-        # Try the Bing images endpoint as a fallback
-        bing_url = "https://www.bing.com/images/search"
-        params = {
-            "q": query,
-            "FORM": "HDRSC2",
-            "first": "1",
-            "tsc": "ImageHoverTitle"
-        }
-
-        headers = {
-            "User-Agent": random.choice(USER_AGENTS),
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-            "Accept-Language": "pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3",
-            "Connection": "keep-alive",
-        }
-
-        async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
-            response = await client.get(bing_url, params=params, headers=headers)
-
-            if response.status_code == 200:
-                # Extract image URLs from Bing
-                image_urls = re.findall(
-                    r'"murl":"([^"]+)"',
-                    response.text
-                )
-
-                return [{"url": url.replace("\\u0026", "&"), "width": None, "height": None}
-                        for url in image_urls[:50]]
-    except:
-        pass
-
-    return []
-
-
 @lru_cache(maxsize=500)
 def clean_wikimedia_url_cached(url: str) -> str:
     """
@@ -248,40 +153,24 @@ def clean_wikimedia_url_cached(url: str) -> str:
 
 def extract_images_from_response_optimized(response_text: str) -> List[Dict]:
     """
-    Ultra-optimized extraction using multiple patterns
+    Ultra-optimized extraction using a compiled regex and batch processing
     """
-    images = []
-    seen_urls = set()
-
-    # Multiple patterns for different Google page structures
-    patterns = [
-        # Main pattern
-        re.compile(r'"ou":"([^"]+)"', re.IGNORECASE),
-        # Direct URLs
-        re.compile(r'https?://[^\s"\'<>]+?\.(?:jpg|png|webp|jpeg)(?:\?[^\s"\'<>]*)?', re.IGNORECASE),
-        # Escaped URLs
-        re.compile(r'\\u003d([^\\u0026]+)\\u0026', re.IGNORECASE),
-        # Alternative pattern
-        re.compile(r'"(https?://[^"]*\.(?:jpg|jpeg|png|webp)[^"]*)"', re.IGNORECASE)
-    ]
+    # Compiled regex (faster)
+    pattern = re.compile(r'https?://[^\s"\'<>]+?\.(?:jpg|png|webp|jpeg)\b', re.IGNORECASE)
 
-    for pattern in patterns:
-        matches = pattern.findall(response_text)
-        for match in matches[:100]:  # Limit per pattern
-            # Clean the URL
-            clean_url = (match.replace('\\u003d', '=')
-                        .replace('\\u0026', '&')
-                        .replace('\\/', '/')
-                        .replace('\\\\', ''))
-
-            clean_url = clean_wikimedia_url_cached(clean_url)
-
-            if (clean_url not in seen_urls and
-                any(ext in clean_url.lower() for ext in ['.jpg', '.jpeg', '.png', '.webp']) and
-                not any(skip in clean_url.lower() for skip in ['favicon', 'logo', 'icon'])):
-
-                seen_urls.add(clean_url)
-                images.append({"url": clean_url, "width": None, "height": None})
+    # Extraction in a single pass
+    image_urls = pattern.findall(response_text)
+
+    # Deduplication with a set (O(1) lookups)
+    seen_urls = set()
+    images = []
+
+    # Process the URLs in one batch
+    for url in image_urls[:200]:  # Raised to compensate for filtering
+        cleaned_url = clean_wikimedia_url_cached(url)
+        if cleaned_url not in seen_urls:
+            seen_urls.add(cleaned_url)
+            images.append({"url": cleaned_url, "width": None, "height": None})
 
     return images
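
A quick illustration of what the new single-pass extractor returns; the HTML snippet is invented, and it assumes clean_wikimedia_url_cached leaves non-Wikimedia URLs untouched:

```python
sample = '''
<img src="https://example.com/photos/cat.jpg">
<img src="https://example.com/photos/cat.jpg">
<img src="https://cdn.example.org/dog.PNG?w=1200">
'''
print(extract_images_from_response_optimized(sample))
# Duplicates are dropped and the non-greedy match stops at the extension:
# [{'url': 'https://example.com/photos/cat.jpg', 'width': None, 'height': None},
#  {'url': 'https://cdn.example.org/dog.PNG', 'width': None, 'height': None}]
```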
 
@@ -377,11 +266,9 @@ async def download_and_process_image(session: httpx.AsyncClient, url: str, inclu
     clean_url = url.replace('\\u003d', '=').replace('\\u0026', '&').replace('\\\\', '').replace('\\/', '/')
 
     headers = {
-        'User-Agent': random.choice(USER_AGENTS),
-        'Accept': 'image/*,*/*;q=0.8',
-        'Accept-Encoding': 'gzip, deflate',
-        'Connection': 'keep-alive',
-        'DNT': '1'
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+        'Accept': 'image/*',
+        'Connection': 'close'
     }
 
     width, height, thumbnail_b64 = None, None, None
@@ -469,28 +356,27 @@ async def enrich_images_ultra_fast(images: List[Dict], include_thumbnails: bool
     if not images:
         return []
 
-    # Optimized HTTP configuration
+    # HTTP configuration tuned for maximum concurrency
     connector = httpx.AsyncClient(
         timeout=httpx.Timeout(10.0),
         limits=httpx.Limits(
-            max_keepalive_connections=100,
-            max_connections=150,
-            keepalive_expiry=30.0
+            max_keepalive_connections=100,  # Many more connections
+            max_connections=150,  # Bigger pool
+            keepalive_expiry=30.0  # Keep connections alive longer
         ),
-        follow_redirects=True,
-        http2=False
+        http2=False  # HTTP/1.1 is still faster for many small connections
     )
 
-    # More conservative semaphore to avoid rate limiting
-    semaphore = asyncio.Semaphore(25)
+    # More aggressive semaphore
+    semaphore = asyncio.Semaphore(30)  # Much more concurrency
 
     async def process_single_image(image_data):
         async with semaphore:
-            # Small random delay to avoid detection
-            await asyncio.sleep(random.uniform(0.1, 0.3))
             return await download_and_process_image(connector, image_data["url"], include_thumbnails)
 
     try:
+        print(f"Starting ultra-parallel processing of {len(images)} images...")
+
         # Create all the tasks at once
         tasks = [process_single_image(img) for img in images]
 
@@ -503,9 +389,13 @@ async def enrich_images_ultra_fast(images: List[Dict], include_thumbnails: bool
             if not isinstance(result, Exception) and result.get('width') and result.get('height'):
                 valid_results.append(result)
 
+        success_rate = len(valid_results) / len(images) * 100
+        print(f"Processing finished: {len(valid_results)}/{len(images)} ({success_rate:.1f}% success)")
+
         return valid_results
 
     except Exception as e:
+        print(f"Error in ultra-fast processing: {e}")
        return []
     finally:
         await connector.aclose()
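
The concurrency pattern used by enrich_images_ultra_fast, reduced to its core as a standalone sketch: one shared AsyncClient, a Semaphore to cap in-flight requests, and a single gather with return_exceptions=True:

```python
import asyncio
import httpx

async def fetch_all(urls, limit=30):
    sem = asyncio.Semaphore(limit)  # caps concurrent downloads, like the Semaphore(30) above
    async with httpx.AsyncClient(timeout=10.0) as client:
        async def fetch_one(url):
            async with sem:
                resp = await client.get(url)
                return resp.status_code
        # Schedule everything at once; failures come back as exception objects
        return await asyncio.gather(*(fetch_one(u) for u in urls), return_exceptions=True)

# asyncio.run(fetch_all(["https://example.com", "https://example.org"]))
```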
@@ -521,7 +411,7 @@ async def get_thumbnail_fast(
     Fetches an ultra-fast thumbnail for a specific image
     """
     try:
-        async with httpx.AsyncClient(timeout=8.0, follow_redirects=True) as client:
+        async with httpx.AsyncClient(timeout=8.0) as client:
             result = await download_and_process_image(client, url, True)
 
             if result.get('thumbnail'):
@@ -538,36 +428,6 @@ async def get_thumbnail_fast(
         raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
 
 
-# Diagnostic endpoint
-@router.get("/status")
-async def status_check():
-    """
-    Checks whether the service is running and whether it can reach Google
-    """
-    try:
-        async with httpx.AsyncClient(timeout=10.0) as client:
-            headers = {
-                "User-Agent": random.choice(USER_AGENTS),
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-            }
-
-            response = await client.get("https://www.google.com", headers=headers)
-
-            return JSONResponse(content={
-                "status": "ok",
-                "google_accessible": response.status_code == 200,
-                "cache_size": len(_url_cache),
-                "timestamp": time.time()
-            })
-    except Exception as e:
-        return JSONResponse(content={
-            "status": "error",
-            "error": str(e),
-            "cache_size": len(_url_cache),
-            "timestamp": time.time()
-        })
-
-
 # Executor cleanup on shutdown
 import atexit
 atexit.register(lambda: thumbnail_executor.shutdown(wait=False))
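
Finally, a hedged smoke test for the updated endpoint; the base URL is an assumed local deployment, and only the "query" and "images" response fields are confirmed by this diff:

```python
import httpx

resp = httpx.get(
    "http://localhost:8000/search",  # assumed host/port
    params={"q": "aurora boreal", "include_thumbnails": False},
    timeout=60.0,
)
resp.raise_for_status()
data = resp.json()
print(data["query"], "->", len(data["images"]), "images")
```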
 