# ativos/scripts/scrap.py
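# Helpers to scrape stock indicators from the Fundamentus and Status Invest
# websites and expose them as dicts / pandas DataFrames.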
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
#from tqdm.notebook import tqdm
#from selenium.webdriver.chrome.options import Options
import sys
import requests
import http.cookiejar
import re
import urllib.request
from lxml.html import fragment_fromstring
from collections import OrderedDict
import urllib.parse
import pandas as pd
def get_data(*args, **kwargs):
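    """Scrape the Fundamentus results table (resultado.php) and return an
    OrderedDict keyed by row index, mapping each ticker to its indicators."""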
    class AppURLopener(urllib.request.FancyURLopener):
        version = "Mozilla/5.0"

    opener = AppURLopener()
    response = opener.open('http://httpbin.org/user-agent')

    url = 'http://www.fundamentus.com.br/resultado.php'
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'),
                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')
                         ]
    #opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
    #                     ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]
    # Search parameters for the stocks.
    # They are left blank so that every available stock is returned.
    data = {'pl_min': '', 'pl_max': '', 'pvp_min': '', 'pvp_max': '',
            'psr_min': '', 'psr_max': '', 'divy_min': '', 'divy_max': '',
            'pativos_min': '', 'pativos_max': '', 'pcapgiro_min': '', 'pcapgiro_max': '',
            'pebit_min': '', 'pebit_max': '', 'fgrah_min': '', 'fgrah_max': '',
            'firma_ebit_min': '', 'firma_ebit_max': '', 'margemebit_min': '', 'margemebit_max': '',
            'margemliq_min': '', 'margemliq_max': '', 'liqcorr_min': '', 'liqcorr_max': '',
            'roic_min': '', 'roic_max': '', 'roe_min': '', 'roe_max': '',
            'liq_min': '', 'liq_max': '', 'patrim_min': '', 'patrim_max': '',
            'divbruta_min': '', 'divbruta_max': '', 'tx_cresc_rec_min': '', 'tx_cresc_rec_max': '',
            'setor': '', 'negociada': 'ON', 'ordem': '1', 'x': '28', 'y': '16'}
    with opener.open(url, urllib.parse.urlencode(data).encode('UTF-8')) as link:
        content = link.read().decode('ISO-8859-1')

    pattern = re.compile('<table id="resultado".*</table>', re.DOTALL)
    reg = re.findall(pattern, content)[0]
    page = fragment_fromstring(reg)
    lista = OrderedDict()

    stocks = page.xpath('tbody')[0].findall("tr")
    todos = []

    for i in range(0, len(stocks)):
        lista[i] = {
            stocks[i].getchildren()[0][0].getchildren()[0].text: {
                'cotacao': stocks[i].getchildren()[1].text,
                'P/L': stocks[i].getchildren()[2].text,
                'P/VP': stocks[i].getchildren()[3].text,
                'PSR': stocks[i].getchildren()[4].text,
                'DY': stocks[i].getchildren()[5].text,
                'P/Ativo': stocks[i].getchildren()[6].text,
                'P/Cap.Giro': stocks[i].getchildren()[7].text,
                'P/EBIT': stocks[i].getchildren()[8].text,
                'P/Ativ.Circ.Liq.': stocks[i].getchildren()[9].text,
                'EV/EBIT': stocks[i].getchildren()[10].text,
                'EBITDA': stocks[i].getchildren()[11].text,
                'Mrg. Ebit': stocks[i].getchildren()[12].text,
                'Mrg.Liq.': stocks[i].getchildren()[13].text,
                'Liq.Corr.': stocks[i].getchildren()[14].text,
                'ROIC': stocks[i].getchildren()[15].text,
                'ROE': stocks[i].getchildren()[16].text,
                'Liq.2m.': stocks[i].getchildren()[17].text,
                'Pat.Liq': stocks[i].getchildren()[18].text,
                'Div.Brut/Pat.': stocks[i].getchildren()[19].text,
                'Cresc.5a': stocks[i].getchildren()[20].text
            }
        }
    return lista
def get_specific_data(stock):
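    """Scrape the Fundamentus detail page (detalhes.php) for a single ticker
    and return a dict mapping each label on the page to its value."""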
    class AppURLopener(urllib.request.FancyURLopener):
        version = "Mozilla/5.0"

    opener = AppURLopener()
    response = opener.open('http://httpbin.org/user-agent')

    url = "http://www.fundamentus.com.br/detalhes.php?papel=" + stock
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'),
                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')
                         ]
    #opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
    #                     ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]
    # Get data from the site
    link = opener.open(url, urllib.parse.urlencode({}).encode('UTF-8'))
    content = link.read().decode('ISO-8859-1')

    # Get all table instances
    pattern = re.compile('<table class="w728">.*</table>', re.DOTALL)
    reg = re.findall(pattern, content)[0]
    reg = "<div>" + reg + "</div>"
    page = fragment_fromstring(reg)

    all_data = {}
    # There are five tables with tr elements; collect every row
    all_trs = []
    all_tables = page.xpath("table")
    for i in range(0, len(all_tables)):
        all_trs = all_trs + all_tables[i].findall("tr")

    # Walk through all the trs and get the label and the
    # value of each line
    for tr_index in range(0, len(all_trs)):
        tr = all_trs[tr_index]

        # Get into the tds
        all_tds = tr.getchildren()
        for td_index in range(0, len(all_tds)):
            td = all_tds[td_index]
            label = ""
            data = ""

            # Some tds carry content and others do not
            if (td.get("class").find("label") != -1):
                # We have a label
                for span in td.getchildren():
                    if (span.get("class").find("txt") != -1):
                        label = span.text

                # If we did find a label we have to look
                # for a value
                if (label and len(label) > 0):
                    next_td = all_tds[td_index + 1]
                    if (next_td.get("class").find("data") != -1):
                        # We have a value
                        for span in next_td.getchildren():
                            if (span.get("class").find("txt") != -1):
                                if (span.text):
                                    data = span.text
                                else:
                                    # The value may be wrapped in a link
                                    span_children = span.getchildren()
                                    if (span_children and len(span_children) > 0):
                                        data = span_children[0].text

                        # Include it in the dict
                        all_data[label] = data

                        # Reset for the next pair
                        label = ""
                        data = ""
    return all_data
def coletar_scrap():
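    """Open the Status Invest advanced search with Selenium, download the
    results CSV (busca-avancada.csv) and load it into a DataFrame.

    Note: the calls below follow the Selenium 3 style API
    (chrome_options/executable_path and find_element_by_xpath)."""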
    sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')
    URL = 'https://statusinvest.com.br/acoes/busca-avancada'

    #output = 'busca-avancada.csv'
    #if path.exists(output):
    #    os.remove(output)

    #chrome_options = Options()
    #chrome_options.binary_location = GOOGLE_CHROME_BIN
    #chrome_options.add_argument('--disable-gpu')
    #chrome_options.add_argument('--no-sandbox')
    #driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=chrome_options)
    #driver = webdriver.Chrome('chromedriver/chromedriver.exe')

    #chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument('--headless')
    #chrome_options.add_argument('--no-sandbox')
    #chrome_options.add_argument('--disable-dev-shm-usage')
    #driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)

    gChromeOptions = webdriver.ChromeOptions()
    gChromeOptions.add_argument("window-size=1920x1480")
    gChromeOptions.add_argument("disable-dev-shm-usage")
    driver = webdriver.Chrome(
        chrome_options=gChromeOptions, executable_path=ChromeDriverManager().install()
    )
    #driver = webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.GOOGLE).install())
    #driver.get(URL)
    driver.get(URL)
    sleep(5)

    button_buscar = driver.find_element_by_xpath('//div/button[contains(@class,"find")]')
    button_buscar.click()
    sleep(5)

    button_skip = driver.find_element_by_xpath('//div/button[contains(@class,"btn-close")]')
    button_skip.click()
    sleep(5)

    button_download = driver.find_element_by_xpath('//div/a[contains(@class, "btn-download")]')
    button_download.click()
    sleep(1)

    #if path.exists(output):
    df = pd.read_csv('busca-avancada.csv', sep=';', decimal=',', thousands='.')
    driver.close()
    return df
def scrap_fundamentus():
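    """Fetch the Fundamentus results table with requests and parse it
    straight into a DataFrame via pandas.read_html."""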
    url = 'http://www.fundamentus.com.br/resultado.php'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    result = requests.get(url, headers=headers)
    df = pd.read_html(result.content)[0]
    return df
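

# Minimal usage sketch, assuming this file is run directly as a script and
# that fundamentus.com.br is reachable; the output filename
# 'fundamentus.csv' is illustrative only.
if __name__ == "__main__":
    fundamentus_df = scrap_fundamentus()
    print(fundamentus_df.head())
    fundamentus_df.to_csv('fundamentus.csv', sep=';', decimal=',', index=False)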