habulaj committed (verified)
Commit 4aae173 · 1 Parent(s): cce5dc2

Update routers/search.py

Files changed (1):
  1. routers/search.py (+90 −230)
routers/search.py CHANGED
@@ -4,7 +4,7 @@ import httpx
 import os
 import json
 import re
-from urllib.parse import unquote, urlencode
+from urllib.parse import unquote
 from PIL import Image
 import io
 import asyncio
@@ -15,11 +15,6 @@ from functools import lru_cache
 import aiofiles
 from concurrent.futures import ThreadPoolExecutor
 import time
-import logging
-import random
-
-# Disable httpx logs
-logging.getLogger("httpx").setLevel(logging.WARNING)
 
 router = APIRouter()
 
@@ -33,15 +28,6 @@ thumbnail_executor = ThreadPoolExecutor(
 _url_cache = {}
 _cache_max_size = 1000
 
-# Rotating, more realistic user agents
-USER_AGENTS = [
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15"
-]
-
 @router.get("/search")
 async def search(
     q: str = Query(..., description="Search term for images"),
@@ -49,115 +35,52 @@ async def search(
     include_thumbnails: bool = Query(True, description="Include base64 thumbnails in the responses")
 ):
     """
-    Searches Google Images with maximum performance and improved block evasion
+    Searches Google Images with maximum performance
     """
     start_time = time.time()
 
-    # Multiple search strategies
-    search_strategies = [
-        {
-            "url": "https://www.google.com/search",
-            "params": {
-                "tbm": "isch",
-                "q": q,
-                "hl": "pt-BR",
-                "gl": "br",
-                "safe": "off",
-                "tbs": "isz:l"  # Large images
-            }
-        },
-        {
-            "url": "https://www.google.com/search",
-            "params": {
-                "tbm": "isch",
-                "q": q,
-                "hl": "en",
-                "gl": "us",
-                "safe": "off",
-                "tbs": "isz:lt,islt:4mp"  # Images > 4 MP
-            }
-        }
-    ]
-
-    # More realistic headers, with rotation
-    def get_random_headers():
-        return {
-            "User-Agent": random.choice(USER_AGENTS),
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.8",
-            "Accept-Encoding": "gzip, deflate, br",
-            "Connection": "keep-alive",
-            "Upgrade-Insecure-Requests": "1",
-            "Sec-Fetch-Dest": "document",
-            "Sec-Fetch-Mode": "navigate",
-            "Sec-Fetch-Site": "none",
-            "Sec-Fetch-User": "?1",
-            "Cache-Control": "max-age=0",
-            "DNT": "1"
-        }
-
-    all_images = []
+    # Google Images URL with parameters for large images
+    google_images_url = "http://www.google.com/search"
+
+    params = {
+        "tbm": "isch",
+        "q": q,
+        "start": 0,
+        "sa": "N",
+        "asearch": "arc",
+        "cs": "1",
+        "tbs": "isz:l",
+        "async": f"arc_id:srp_GgSMaOPQOtL_5OUPvbSTOQ_110,ffilt:all,ve_name:MoreResultsContainer,inf:1,_id:arc-srp_GgSMaOPQOtL_5OUPvbSTOQ_110,_pms:s,_fmt:pc"
+    }
+
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+        "Referer": "https://www.google.com/"
+    }
 
     try:
-        # Try multiple strategies
-        for i, strategy in enumerate(search_strategies):
-            try:
-                # HTTP client with anti-detection settings
-                async with httpx.AsyncClient(
-                    timeout=30.0,
-                    follow_redirects=True,  # Follow redirects
-                    limits=httpx.Limits(max_redirects=3)
-                ) as client:
-
-                    # Unique headers for each attempt
-                    headers = get_random_headers()
-
-                    # Add a random delay between attempts
-                    if i > 0:
-                        await asyncio.sleep(random.uniform(1, 3))
-
-                    response = await client.get(
-                        strategy["url"],
-                        params=strategy["params"],
-                        headers=headers
-                    )
-
-                    # Check whether we were redirected to a captcha or a block page
-                    if response.status_code == 200 and "captcha" not in response.text.lower():
-                        images = extract_images_from_response_optimized(response.text)
-                        if images:
-                            all_images.extend(images)
-                            print(f"Strategy {i+1}: {len(images)} images found")
-                            break  # It worked, stop here
-                    else:
-                        print(f"Strategy {i+1} failed: status {response.status_code}")
-                        continue
-
-            except Exception as e:
-                print(f"Error in strategy {i+1}: {str(e)}")
-                continue
-
-        # If no strategy worked, try the alternative API
-        if not all_images:
-            print("Trying alternative method...")
-            all_images = await search_alternative_method(q)
-
-        if not all_images:
-            raise HTTPException(
-                status_code=503,
-                detail="Could not access Google Images. Try again in a few minutes."
-            )
-
-        # Remove duplicates
-        unique_images = []
-        seen_urls = set()
-        for img in all_images:
-            if img.get('url') not in seen_urls:
-                seen_urls.add(img.get('url'))
-                unique_images.append(img)
+        # Google search (fast)
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            response = await client.get(google_images_url, params=params, headers=headers)
+
+        if response.status_code != 200:
+            raise HTTPException(status_code=response.status_code, detail="Error searching Google Images")
+
+        print(f"Google responded in {time.time() - start_time:.2f}s")
+        extract_start = time.time()
+
+        # Optimized extraction
+        images = extract_images_from_response_optimized(response.text)
+        print(f"Extraction finished in {time.time() - extract_start:.2f}s - {len(images)} URLs")
 
         # Massive parallel processing
-        enriched_images = await enrich_images_ultra_fast(unique_images[:100], include_thumbnails)
+        processing_start = time.time()
+        enriched_images = await enrich_images_ultra_fast(images, include_thumbnails)
+        print(f"Processing finished in {time.time() - processing_start:.2f}s")
 
         # Fast filtering
         valid_images = [
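
For orientation, a minimal sketch of how this router is typically mounted and called; the app module below is an assumption, since this diff only touches routers/search.py:

```python
# Hypothetical wiring (module name and port are assumptions, not from this commit):
from fastapi import FastAPI
from routers.search import router

app = FastAPI()
app.include_router(router)

# e.g. with `uvicorn main:app`:
#   GET http://localhost:8000/search?q=sunset&include_thumbnails=false
```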
@@ -165,11 +88,32 @@ async def search(
             if img.get('width', 0) >= min_width and img.get('height', 0) > 0
         ]
 
+        # If there are few results, run an additional, stricter search
+        if len(valid_images) < 20:
+            params["tbs"] = "isz:lt,islt:4mp"
+
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response2 = await client.get(google_images_url, params=params, headers=headers)
+
+            if response2.status_code == 200:
+                additional_images = extract_images_from_response_optimized(response2.text)
+                additional_enriched = await enrich_images_ultra_fast(additional_images, include_thumbnails)
+
+                # Fast merge, with a set for deduplication
+                seen_urls = {img.get('url') for img in valid_images}
+                for img in additional_enriched:
+                    if (img.get('url') not in seen_urls
+                        and img.get('width', 0) >= min_width
+                        and img.get('height', 0) > 0):
+                        valid_images.append(img)
+                        seen_urls.add(img.get('url'))
+
         # Sorting and limiting
         valid_images.sort(key=lambda x: x.get('width', 0), reverse=True)
         final_images = valid_images[:50]
 
         total_time = time.time() - start_time
+        print(f"TOTAL TIME: {total_time:.2f}s - {len(final_images)} final images")
 
         return JSONResponse(content={
             "query": q,
@@ -180,51 +124,12 @@ async def search(
             "images": final_images
         })
 
-    except HTTPException:
-        raise
+    except httpx.TimeoutException:
+        raise HTTPException(status_code=408, detail="Request to Google timed out")
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error running the search: {str(e)}")
 
 
-async def search_alternative_method(query: str) -> List[Dict]:
-    """
-    Alternative method using different endpoints
-    """
-    try:
-        # Try the Bing images endpoint as a fallback
-        bing_url = "https://www.bing.com/images/search"
-        params = {
-            "q": query,
-            "FORM": "HDRSC2",
-            "first": "1",
-            "tsc": "ImageHoverTitle"
-        }
-
-        headers = {
-            "User-Agent": random.choice(USER_AGENTS),
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-            "Accept-Language": "pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3",
-            "Connection": "keep-alive",
-        }
-
-        async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
-            response = await client.get(bing_url, params=params, headers=headers)
-
-            if response.status_code == 200:
-                # Extract image URLs from Bing
-                image_urls = re.findall(
-                    r'"murl":"([^"]+)"',
-                    response.text
-                )
-
-                return [{"url": url.replace("\\u0026", "&"), "width": None, "height": None}
-                        for url in image_urls[:50]]
-    except:
-        pass
-
-    return []
-
-
 @lru_cache(maxsize=500)
 def clean_wikimedia_url_cached(url: str) -> str:
     """
@@ -248,40 +153,24 @@ def clean_wikimedia_url_cached(url: str) -> str:
 
 def extract_images_from_response_optimized(response_text: str) -> List[Dict]:
     """
-    Ultra-optimized extraction using multiple patterns
+    Ultra-optimized extraction using a compiled regex and batch processing
     """
-    images = []
-    seen_urls = set()
-
-    # Multiple patterns for different Google page structures
-    patterns = [
-        # Main pattern
-        re.compile(r'"ou":"([^"]+)"', re.IGNORECASE),
-        # Direct URLs
-        re.compile(r'https?://[^\s"\'<>]+?\.(?:jpg|png|webp|jpeg)(?:\?[^\s"\'<>]*)?', re.IGNORECASE),
-        # Escaped URLs
-        re.compile(r'\\u003d([^\\u0026]+)\\u0026', re.IGNORECASE),
-        # Alternative pattern
-        re.compile(r'"(https?://[^"]*\.(?:jpg|jpeg|png|webp)[^"]*)"', re.IGNORECASE)
-    ]
+    # Compiled regex (faster)
+    pattern = re.compile(r'https?://[^\s"\'<>]+?\.(?:jpg|png|webp|jpeg)\b', re.IGNORECASE)
 
-    for pattern in patterns:
-        matches = pattern.findall(response_text)
-        for match in matches[:100]:  # Limit per pattern
-            # Clean the URL
-            clean_url = (match.replace('\\u003d', '=')
-                        .replace('\\u0026', '&')
-                        .replace('\\/', '/')
-                        .replace('\\\\', ''))
-
-            clean_url = clean_wikimedia_url_cached(clean_url)
-
-            if (clean_url not in seen_urls and
-                any(ext in clean_url.lower() for ext in ['.jpg', '.jpeg', '.png', '.webp']) and
-                not any(skip in clean_url.lower() for skip in ['favicon', 'logo', 'icon'])):
-
-                seen_urls.add(clean_url)
-                images.append({"url": clean_url, "width": None, "height": None})
+    # Extraction in a single pass
+    image_urls = pattern.findall(response_text)
+
+    # Deduplication with a set (O(1) lookups)
+    seen_urls = set()
+    images = []
+
+    # Process the URLs in one batch
+    for url in image_urls[:200]:  # Raised to compensate for filtering
+        cleaned_url = clean_wikimedia_url_cached(url)
+        if cleaned_url not in seen_urls:
+            seen_urls.add(cleaned_url)
+            images.append({"url": cleaned_url, "width": None, "height": None})
 
     return images
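
A quick illustration of what the new single-pass extractor returns; the HTML snippet is invented, and it assumes clean_wikimedia_url_cached leaves non-Wikimedia URLs untouched:

```python
sample = '''
<img src="https://example.com/photos/cat.jpg">
<img src="https://example.com/photos/cat.jpg">
<img src="https://cdn.example.org/dog.PNG?w=1200">
'''
print(extract_images_from_response_optimized(sample))
# Duplicates are dropped and the non-greedy match stops at the extension:
# [{'url': 'https://example.com/photos/cat.jpg', 'width': None, 'height': None},
#  {'url': 'https://cdn.example.org/dog.PNG', 'width': None, 'height': None}]
```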
 
@@ -377,11 +266,9 @@ async def download_and_process_image(session: httpx.AsyncClient, url: str, inclu
     clean_url = url.replace('\\u003d', '=').replace('\\u0026', '&').replace('\\\\', '').replace('\\/', '/')
 
     headers = {
-        'User-Agent': random.choice(USER_AGENTS),
-        'Accept': 'image/*,*/*;q=0.8',
-        'Accept-Encoding': 'gzip, deflate',
-        'Connection': 'keep-alive',
-        'DNT': '1'
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+        'Accept': 'image/*',
+        'Connection': 'close'
     }
 
     width, height, thumbnail_b64 = None, None, None
@@ -469,28 +356,27 @@ async def enrich_images_ultra_fast(images: List[Dict], include_thumbnails: bool
     if not images:
         return []
 
-    # Optimized HTTP configuration
+    # HTTP configuration tuned for maximum concurrency
     connector = httpx.AsyncClient(
         timeout=httpx.Timeout(10.0),
         limits=httpx.Limits(
-            max_keepalive_connections=100,
-            max_connections=150,
-            keepalive_expiry=30.0
+            max_keepalive_connections=100,  # Many more connections
+            max_connections=150,  # Bigger pool
+            keepalive_expiry=30.0  # Keep connections alive longer
         ),
-        follow_redirects=True,
-        http2=False
+        http2=False  # HTTP/1.1 is still faster for many small connections
     )
 
-    # More conservative semaphore to avoid rate limiting
-    semaphore = asyncio.Semaphore(25)
+    # More aggressive semaphore
+    semaphore = asyncio.Semaphore(30)  # Much more concurrency
 
     async def process_single_image(image_data):
         async with semaphore:
-            # Small random delay to avoid detection
-            await asyncio.sleep(random.uniform(0.1, 0.3))
             return await download_and_process_image(connector, image_data["url"], include_thumbnails)
 
     try:
+        print(f"Starting ultra-parallel processing of {len(images)} images...")
+
         # Create all the tasks at once
         tasks = [process_single_image(img) for img in images]
 
@@ -503,9 +389,13 @@ async def enrich_images_ultra_fast(images: List[Dict], include_thumbnails: bool
             if not isinstance(result, Exception) and result.get('width') and result.get('height'):
                 valid_results.append(result)
 
+        success_rate = len(valid_results) / len(images) * 100
+        print(f"Processing finished: {len(valid_results)}/{len(images)} ({success_rate:.1f}% success)")
+
         return valid_results
 
     except Exception as e:
+        print(f"Error in ultra-fast processing: {e}")
        return []
     finally:
         await connector.aclose()
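
The concurrency pattern used by enrich_images_ultra_fast, reduced to its core as a standalone sketch: one shared AsyncClient, a Semaphore to cap in-flight requests, and a single gather with return_exceptions=True:

```python
import asyncio
import httpx

async def fetch_all(urls, limit=30):
    sem = asyncio.Semaphore(limit)  # caps concurrent downloads, like the Semaphore(30) above
    async with httpx.AsyncClient(timeout=10.0) as client:
        async def fetch_one(url):
            async with sem:
                resp = await client.get(url)
                return resp.status_code
        # Schedule everything at once; failures come back as exception objects
        return await asyncio.gather(*(fetch_one(u) for u in urls), return_exceptions=True)

# asyncio.run(fetch_all(["https://example.com", "https://example.org"]))
```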
@@ -521,7 +411,7 @@ async def get_thumbnail_fast(
     Fetches an ultra-fast thumbnail for a specific image
     """
     try:
-        async with httpx.AsyncClient(timeout=8.0, follow_redirects=True) as client:
+        async with httpx.AsyncClient(timeout=8.0) as client:
             result = await download_and_process_image(client, url, True)
 
             if result.get('thumbnail'):
@@ -538,36 +428,6 @@ async def get_thumbnail_fast(
         raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
 
 
-# Diagnostic endpoint
-@router.get("/status")
-async def status_check():
-    """
-    Checks whether the service is running and whether it can reach Google
-    """
-    try:
-        async with httpx.AsyncClient(timeout=10.0) as client:
-            headers = {
-                "User-Agent": random.choice(USER_AGENTS),
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-            }
-
-            response = await client.get("https://www.google.com", headers=headers)
-
-            return JSONResponse(content={
-                "status": "ok",
-                "google_accessible": response.status_code == 200,
-                "cache_size": len(_url_cache),
-                "timestamp": time.time()
-            })
-    except Exception as e:
-        return JSONResponse(content={
-            "status": "error",
-            "error": str(e),
-            "cache_size": len(_url_cache),
-            "timestamp": time.time()
-        })
-
-
 # Executor cleanup on shutdown
 import atexit
 atexit.register(lambda: thumbnail_executor.shutdown(wait=False))
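
Finally, a hedged smoke test for the updated endpoint; the base URL is an assumed local deployment, and only the "query" and "images" response fields are confirmed by this diff:

```python
import httpx

resp = httpx.get(
    "http://localhost:8000/search",  # assumed host/port
    params={"q": "aurora boreal", "include_thumbnails": False},
    timeout=60.0,
)
resp.raise_for_status()
data = resp.json()
print(data["query"], "->", len(data["images"]), "images")
```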
 