Update extract.py
Browse files- extract.py +16 -1
extract.py
CHANGED
@@ -1,8 +1,13 @@
|
|
1 |
from selenium import webdriver
|
|
|
|
|
|
|
2 |
from selenium.common.exceptions import WebDriverException
|
|
|
3 |
import time
|
4 |
import random
|
5 |
-
|
|
|
6 |
|
7 |
# Lista de User Agents para rotar
|
8 |
user_agents = [
|
@@ -28,6 +33,7 @@ user_agents = [
|
|
28 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:91.0) Gecko/20100101 Firefox/91.0",
|
29 |
]
|
30 |
|
|
|
31 |
def get_random_user_agent():
|
32 |
return random.choice(user_agents)
|
33 |
|
@@ -37,12 +43,20 @@ def get_random_window_size():
|
|
37 |
]
|
38 |
return random.choice(window_sizes)
|
39 |
|
|
|
|
|
|
|
|
|
40 |
def extract_data(user_input, mode):
|
|
|
|
|
|
|
41 |
options = webdriver.ChromeOptions()
|
42 |
options.add_argument('--headless')
|
43 |
options.add_argument('--no-sandbox')
|
44 |
options.add_argument('--disable-dev-shm-usage')
|
45 |
options.add_argument(f"user-agent={get_random_user_agent()}")
|
|
|
46 |
|
47 |
try:
|
48 |
wd = webdriver.Chrome(options=options)
|
@@ -60,6 +74,7 @@ def extract_data(user_input, mode):
|
|
60 |
page_content = wd.page_source
|
61 |
|
62 |
except WebDriverException as e:
|
|
|
63 |
return []
|
64 |
finally:
|
65 |
if wd:
|
|
|
1 |
from selenium import webdriver
|
2 |
+
from selenium.webdriver.common.by import By
|
3 |
+
from selenium.webdriver.common.keys import Keys
|
4 |
+
from selenium.webdriver.chrome.service import Service
|
5 |
from selenium.common.exceptions import WebDriverException
|
6 |
+
from bs4 import BeautifulSoup
|
7 |
import time
|
8 |
import random
|
9 |
+
import requests
|
10 |
+
from fp.fp import FreeProxy
|
11 |
|
12 |
# Lista de User Agents para rotar
|
13 |
user_agents = [
|
|
|
33 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:91.0) Gecko/20100101 Firefox/91.0",
|
34 |
]
|
35 |
|
36 |
+
|
37 |
def get_random_user_agent():
|
38 |
return random.choice(user_agents)
|
39 |
|
|
|
43 |
]
|
44 |
return random.choice(window_sizes)
|
45 |
|
46 |
+
def get_proxy():
|
47 |
+
proxy = FreeProxy(rand=True, timeout=1).get()
|
48 |
+
return proxy
|
49 |
+
|
50 |
def extract_data(user_input, mode):
|
51 |
+
proxy = get_proxy()
|
52 |
+
proxy_url = f"http://{proxy}"
|
53 |
+
|
54 |
options = webdriver.ChromeOptions()
|
55 |
options.add_argument('--headless')
|
56 |
options.add_argument('--no-sandbox')
|
57 |
options.add_argument('--disable-dev-shm-usage')
|
58 |
options.add_argument(f"user-agent={get_random_user_agent()}")
|
59 |
+
options.add_argument('--proxy-server=%s' % proxy_url)
|
60 |
|
61 |
try:
|
62 |
wd = webdriver.Chrome(options=options)
|
|
|
74 |
page_content = wd.page_source
|
75 |
|
76 |
except WebDriverException as e:
|
77 |
+
print(f"Request failed with proxy {proxy_url}. Error: {e}")
|
78 |
return []
|
79 |
finally:
|
80 |
if wd:
|