from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
from tqdm.notebook import tqdm
#from selenium.webdriver.chrome.options import Options
import time
import sys
import requests
import http.cookiejar
import lxml
import re
import urllib.request
import json
import ast
import datetime
import pandas as pd
from pymongo import MongoClient
from datetime import datetime
from lxml.html import fragment_fromstring
from collections import OrderedDict
import urllib.parse


def get_data(*args, **kwargs):
    class AppURLopener(urllib.request.FancyURLopener):
        version = "Mozilla/5.0"

    opener = AppURLopener()
    response = opener.open('http://httpbin.org/user-agent')

    url = 'http://www.fundamentus.com.br/resultado.php'
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'),
                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]
    #opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
    #                     ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]

    # These are the stock screening parameters.
    # They are left blank so that every available stock is returned.
    data = {'pl_min': '', 'pl_max': '', 'pvp_min': '', 'pvp_max': '', 'psr_min': '', 'psr_max': '',
            'divy_min': '', 'divy_max': '', 'pativos_min': '', 'pativos_max': '',
            'pcapgiro_min': '', 'pcapgiro_max': '', 'pebit_min': '', 'pebit_max': '',
            'fgrah_min': '', 'fgrah_max': '', 'firma_ebit_min': '', 'firma_ebit_max': '',
            'margemebit_min': '', 'margemebit_max': '', 'margemliq_min': '', 'margemliq_max': '',
            'liqcorr_min': '', 'liqcorr_max': '', 'roic_min': '', 'roic_max': '',
            'roe_min': '', 'roe_max': '', 'liq_min': '', 'liq_max': '',
            'patrim_min': '', 'patrim_max': '', 'divbruta_min': '', 'divbruta_max': '',
            'tx_cresc_rec_min': '', 'tx_cresc_rec_max': '', 'setor': '', 'negociada': 'ON',
            'ordem': '1', 'x': '28', 'y': '16'}

    with opener.open(url, urllib.parse.urlencode(data).encode('UTF-8')) as link:
        content = link.read().decode('ISO-8859-1')

    # Extract the screening results table (the <table id="resultado"> element
    # on resultado.php)
    pattern = re.compile('<table id="resultado".*</table>', re.DOTALL)
    reg = re.findall(pattern, content)[0]
    page = fragment_fromstring(reg)
    lista = OrderedDict()
    stocks = page.xpath('tbody')[0].findall("tr")
    todos = []

    # Build one entry per row, keyed by the ticker in the first column
    for i in range(0, len(stocks)):
        lista[i] = {
            stocks[i].getchildren()[0][0].getchildren()[0].text: {
                'cotacao': stocks[i].getchildren()[1].text,
                'P/L': stocks[i].getchildren()[2].text,
                'P/VP': stocks[i].getchildren()[3].text,
                'PSR': stocks[i].getchildren()[4].text,
                'DY': stocks[i].getchildren()[5].text,
                'P/Ativo': stocks[i].getchildren()[6].text,
                'P/Cap.Giro': stocks[i].getchildren()[7].text,
                'P/EBIT': stocks[i].getchildren()[8].text,
                'P/Ativ.Circ.Liq.': stocks[i].getchildren()[9].text,
                'EV/EBIT': stocks[i].getchildren()[10].text,
                'EBITDA': stocks[i].getchildren()[11].text,
                'Mrg. Ebit': stocks[i].getchildren()[12].text,
                'Mrg.Liq.': stocks[i].getchildren()[13].text,
                'Liq.Corr.': stocks[i].getchildren()[14].text,
                'ROIC': stocks[i].getchildren()[15].text,
                'ROE': stocks[i].getchildren()[16].text,
                'Liq.2m.': stocks[i].getchildren()[17].text,
                'Pat.Liq': stocks[i].getchildren()[18].text,
                'Div.Brut/Pat.': stocks[i].getchildren()[19].text,
                'Cresc.5a': stocks[i].getchildren()[20].text
            }
        }

    return lista
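
# Convenience sketch, not part of the original script: get_data() returns an
# OrderedDict keyed by row index, where each value maps a ticker to its
# indicator dict. The helper below (a hypothetical name) flattens that result
# into a pandas DataFrame indexed by ticker; the values remain the raw strings
# scraped from the page.
def get_data_as_dataframe():
    """Illustrative helper: flatten get_data() into a ticker-indexed DataFrame."""
    raw = get_data()
    rows = {}
    for item in raw.values():
        for ticker, indicators in item.items():
            rows[ticker] = indicators
    return pd.DataFrame.from_dict(rows, orient='index')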
def get_specific_data(stock):
    class AppURLopener(urllib.request.FancyURLopener):
        version = "Mozilla/5.0"

    opener = AppURLopener()
    response = opener.open('http://httpbin.org/user-agent')

    url = "http://www.fundamentus.com.br/detalhes.php?papel=" + stock
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'),
                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]
    #opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
    #                     ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]

    # Get data from site
    link = opener.open(url, urllib.parse.urlencode({}).encode('UTF-8'))
    content = link.read().decode('ISO-8859-1')

    # Get all table instances (the class name "w728" is assumed from the
    # detalhes.php markup) and wrap the match in a <div> so that
    # fragment_fromstring() receives a single root element
    pattern = re.compile('<table class="w728".*</table>', re.DOTALL)
    reg = re.findall(pattern, content)[0]
    reg = "<div>" + reg + "</div>"
    page = fragment_fromstring(reg)
    all_data = {}

    # There are 5 tables with tr elements; collect all the trs
    all_trs = []
    all_tables = page.xpath("table")
    for i in range(0, len(all_tables)):
        all_trs = all_trs + all_tables[i].findall("tr")

    # Run through all the trs and get the label and the
    # data for each line
    for tr_index in range(0, len(all_trs)):
        tr = all_trs[tr_index]
        # Step into each td
        all_tds = tr.getchildren()
        for td_index in range(0, len(all_tds)):
            td = all_tds[td_index]
            label = ""
            data = ""
            # Some tds on the page carry content and others do not
            if (td.get("class").find("label") != -1):
                # We have a label
                for span in td.getchildren():
                    if (span.get("class").find("txt") != -1):
                        label = span.text

                # If we did find a label we have to look
                # for a value
                if (label and len(label) > 0):
                    next_td = all_tds[td_index + 1]
                    if (next_td.get("class").find("data") != -1):
                        # We have a data cell
                        for span in next_td.getchildren():
                            if (span.get("class").find("txt") != -1):
                                if (span.text):
                                    data = span.text
                                else:
                                    # If it is a link
                                    span_children = span.getchildren()
                                    if (span_children and len(span_children) > 0):
                                        data = span_children[0].text

                        # Include into dict
                        all_data[label] = data

                        # Erase it
                        label = ""
                        data = ""

    return all_data


def coletar_scrap():
    sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')
    URL = 'https://statusinvest.com.br/acoes/busca-avancada'
    #output = 'busca-avancada.csv'
    #if path.exists(output):
    #    os.remove(output)

    #chrome_options = Options()
    #chrome_options.binary_location = GOOGLE_CHROME_BIN
    #chrome_options.add_argument('--disable-gpu')
    #chrome_options.add_argument('--no-sandbox')
    #driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=chrome_options)
    #driver = webdriver.Chrome('chromedriver/chromedriver.exe')
    #chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument('--headless')
    #chrome_options.add_argument('--no-sandbox')
    #chrome_options.add_argument('--disable-dev-shm-usage')
    #driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)

    gChromeOptions = webdriver.ChromeOptions()
    gChromeOptions.add_argument("window-size=1920x1480")
    gChromeOptions.add_argument("disable-dev-shm-usage")
    driver = webdriver.Chrome(
        chrome_options=gChromeOptions,
        executable_path=ChromeDriverManager().install()
    )
    #driver = webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.GOOGLE).install())

    driver.get(URL)
    sleep(5)
    button_buscar = driver.find_element_by_xpath('//div/button[contains(@class,"find")]')
    button_buscar.click()
    sleep(5)
    button_skip = driver.find_element_by_xpath('//div/button[contains(@class,"btn-close")]')
    button_skip.click()
    sleep(5)
    button_download = driver.find_element_by_xpath('//div/a[contains(@class, "btn-download")]')
    button_download.click()
    sleep(1)

    # The downloaded CSV is assumed to land in the working directory
    #if path.exists(output):
    df = pd.read_csv('busca-avancada.csv', sep=';', decimal=',', thousands='.')
    driver.close()
    return df


def scrap_fundamentus():
    url = 'http://www.fundamentus.com.br/resultado.php'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    result = requests.get(url, headers=headers)
    df = pd.read_html(result.content)[0]
    return df