from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
import time
import random

# Pool of desktop user-agent strings; one is picked at random per session so
# repeated headless requests look less uniform.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:87.0) Gecko/20100101 Firefox/87.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:88.0) Gecko/20100101 Firefox/88.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:91.0) Gecko/20100101 Firefox/91.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:91.0) Gecko/20100101 Firefox/91.0",
]


def get_random_user_agent():
    """Return a randomly chosen user-agent string from the pool."""
    return random.choice(user_agents)


def extract_data(user_input, mode):
    # Headless Chrome with a randomized user agent for each run.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument(f"user-agent={get_random_user_agent()}")

    wd = None  # defined up front so the finally block never hits an unbound name
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)

        # Build the traffic-analyzer keywords URL for the given domain and mode.
        url_busqueda = f"https://app.neilpatel.com/es/traffic_analyzer/keywords?domain={user_input}&lang=es&locId=2724&mode={mode}"
        wd.get(url_busqueda)

        # Random delay so the JavaScript-rendered report has time to load and
        # the request timing is less predictable.
        time.sleep(random.uniform(10, 20))

        page_content = wd.page_source
    except WebDriverException:
        return []
    finally:
        if wd:
            wd.quit()

    soup = BeautifulSoup(page_content, 'html.parser')
    # The report is rendered client-side inside the #root container.
    root_div = soup.find('div', id='root')
    if not root_div:
        return []

    # Debug output: dump the parsed container markup.
    print(root_div.prettify())

    # Flatten the rendered markup into one text line per node.
    texto_plano = root_div.get_text(separator='\n', strip=True)

    # Skip everything up to and including the "Última actualización" header,
    # which closes the table header row.
    keyword = "Última actualización"
    index = texto_plano.find(keyword)
    if index != -1:
        texto_plano = texto_plano[index + len(keyword):].strip()

    # Drop lines containing "Búsquedas" (table chrome rather than row data).
    lineas = texto_plano.split('\n')
    lineas_filtradas = [linea for linea in lineas if "Búsquedas" not in linea]

    # Truncate at the "ACTUALIZA A PRO" upsell banner; nothing after it is row data.
    for i, linea in enumerate(lineas_filtradas):
        if "ACTUALIZA A PRO" in linea:
            lineas_filtradas = lineas_filtradas[:i]
            break
    def parsear_texto(lineas):
        # Each table row arrives flattened as 7 consecutive lines:
        # keyword, URL, volume, position, visits, SD, last update.
        datos_parseados = []
        for i in range(0, len(lineas), 7):
            if i + 6 < len(lineas):
                palabra_clave = lineas[i]
                url = lineas[i + 1]
                volumen = lineas[i + 2]
                posicion = lineas[i + 3]
                visitas = lineas[i + 4]
                sd = lineas[i + 5]
                ultima_actualizacion = lineas[i + 6]
                datos_parseados.append({
                    "Palabras clave": palabra_clave,
                    "URL": url,
                    "Volumen": volumen,
                    "Posición": posicion,
                    "Visitas": visitas,
                    "SD": sd,
                    "Última actualización": ultima_actualizacion
                })
        return datos_parseados

    datos_parseados = parsear_texto(lineas_filtradas)
    return datos_parseados
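

# Example usage (a minimal sketch): "example.com" and the "domain" mode value are
# placeholders, not values confirmed by this script; substitute your own domain
# and whatever mode the target page expects.
if __name__ == "__main__":
    resultados = extract_data("example.com", "domain")
    for fila in resultados:
        print(fila)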