Update extract.py

extract.py (+8 -35)
--- a/extract.py
+++ b/extract.py
@@ -1,10 +1,11 @@
 from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.chrome.service import Service
 from selenium.common.exceptions import WebDriverException
 from bs4 import BeautifulSoup
 import time
 import random
-import logging
-from fp.fp import FreeProxy
 
 # List of User Agents to rotate
 user_agents = [
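The three imports added here (By, Keys, Service) are standard Selenium 4 helpers, but none of them is used anywhere in the hunks below. A minimal sketch of what they are typically for, assuming they were added in anticipation of direct page interaction; the element locator and the typed text are made up for illustration:

    # Hypothetical usage of the newly imported helpers; not part of this
    # commit's visible changes.
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.chrome.service import Service

    service = Service()                    # can point at a specific chromedriver binary
    wd = webdriver.Chrome(service=service)
    box = wd.find_element(By.NAME, "q")    # locate an element by its name attribute
    box.send_keys("ejemplo", Keys.ENTER)   # type text, then press Enter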
@@ -33,38 +34,19 @@ user_agents = [
 def get_random_user_agent():
     return random.choice(user_agents)
 
-def get_random_window_size():
-    window_sizes = [
-        (1920, 1080), (1366, 768), (1440, 900), (1536, 864), (1280, 800), (1280, 720), (1024, 768)
-    ]
-    return random.choice(window_sizes)
-
-def get_proxy():
-    proxy = FreeProxy(rand=True, timeout=1).get()
-    return proxy
-
 def extract_data(user_input, mode):
-    proxy = get_proxy()
-    proxy_url = f"http://{proxy}"
-
     options = webdriver.ChromeOptions()
     options.add_argument('--headless')
     options.add_argument('--no-sandbox')
     options.add_argument('--disable-dev-shm-usage')
     options.add_argument(f"user-agent={get_random_user_agent()}")
-    options.add_argument('--proxy-server=%s' % proxy_url)
 
-    wd = None
     try:
         wd = webdriver.Chrome(options=options)
-
-        wd.set_window_size(window_size[0], window_size[1])
+        wd.set_window_size(1080, 720)
 
         # Build the search URL
         url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
-
-        logging.info(f"Making request to {url_busqueda} with IP: {proxy_url}")
-
         wd.get(url_busqueda)
 
         # Random wait to simulate human behavior
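Two things are worth noting in this hunk. First, the commit removes the FreeProxy plumbing entirely, so requests now leave from the host's own IP with only the User-Agent rotated. Second, the deleted call wd.set_window_size(window_size[0], window_size[1]) referenced window_size without assigning it anywhere in the visible code, so the old version would have raised a NameError; get_random_window_size() was defined but, as far as these hunks show, never called. The new code sidesteps that by hardcoding 1080x720. If random sizing were ever wanted back, a minimal sketch would be:

    # Sketch only: assign the random size before using it (the old code
    # skipped this assignment, which is the likely NameError source).
    window_size = get_random_window_size()
    wd.set_window_size(window_size[0], window_size[1])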
@@ -73,15 +55,7 @@ def extract_data(user_input, mode):
         # Get the page content
         page_content = wd.page_source
 
-        # Get the HTTP response code
-        response_status = wd.execute_script("return document.readyState")
-        if response_status == "complete":
-            logging.info(f"Request with IP: {proxy_url} returned status code 200")
-        else:
-            logging.warning(f"Request with IP: {proxy_url} did not return status code 200")
-
     except WebDriverException as e:
-        logging.error(f"Request failed with proxy {proxy_url}. Error: {e}")
         return []
     finally:
         if wd:
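The deleted block here conflated document.readyState with an HTTP status: readyState == "complete" only says the DOM finished loading, and Selenium never exposes the response's status code, so the old "returned status code 200" log message was misleading. If a load-completion check is still wanted, an explicit wait is the idiomatic replacement; a sketch assuming a 10 second budget:

    # Wait until the DOM reports it has finished loading. This checks
    # readiness, not the HTTP status code, which Selenium does not expose.
    from selenium.webdriver.support.ui import WebDriverWait

    WebDriverWait(wd, 10).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )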
@@ -93,19 +67,18 @@ def extract_data(user_input, mode):
     # Find the div with id="root"
     root_div = soup.find('div', id='root')
     if not root_div:
-        logging.error("No se encontró el div con id 'root'")
         return []
 
+    # Print the content of the div with id="root"
+    print(root_div.prettify())
+
     # Extract the plain text inside the div
     texto_plano = root_div.get_text(separator='\n', strip=True)
 
-    # Log the content of the root div
-    logging.info(f"Contenido del div 'root':\n{texto_plano}")
-
     # Find the specific keyword "Última actualización" and discard everything before it
     keyword = "Última actualización"
     index = texto_plano.find(keyword)
-    if index != -1:
+    if (index != -1):
         texto_plano = texto_plano[index + len(keyword):].strip()
 
     # Remove all lines containing the word "Búsquedas"
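The closing comment announces that every line containing "Búsquedas" gets removed, but the diff context ends before that code. One plausible implementation, offered only as a sketch since the actual lines are outside this diff:

    # Drop every line of the extracted text that mentions "Búsquedas".
    texto_plano = "\n".join(
        linea for linea in texto_plano.splitlines() if "Búsquedas" not in linea
    )

(The parentheses the commit adds around index != -1 are redundant in Python, though harmless.)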
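For reference, a hedged usage sketch of the function this commit touches. The domain and the mode value are assumptions; the diff never shows what mode accepts, and the URL hardcodes lang=es and locId=2724:

    # Hypothetical call: "example.com" and "domain" are placeholder arguments.
    resultado = extract_data("example.com", "domain")
    if not resultado:
        print("Extraction failed, or the page structure changed.")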