import re
import traceback
from urllib.parse import urljoin

import chromedriver_autoinstaller
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from sklearn.preprocessing import LabelEncoder

chromedriver_autoinstaller.install()
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")


def selenium_fetch(url):
    """Render `url` in headless Chrome and dump the page source to temp/temp.html."""
    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        with open("temp/temp.html", "w") as f:
            f.write(driver.page_source)
        return True
    except Exception:
        # Covers InvalidSessionIdException and anything else Selenium raises.
        print(traceback.format_exc())
        return False
    finally:
        if driver is not None:
            driver.quit()


def get_batting_team(soup, status, inning, teams_this_match):
    """Work out which team is batting from the match status line."""
    batting_team = ""
    if inning == 2:
        # Second innings: the status reads "<team> need N runs ...".
        batting_team = status.split("need")[0].strip()
        for team in teams_this_match:
            if team.lower() in batting_team.lower():
                batting_team = team
    else:
        # First innings: infer the batting side from the toss result.
        for idx, team in enumerate(teams_this_match):
            if team.lower() in status.lower():
                if "opt to bowl" in status.lower():
                    batting_team = teams_this_match[1 - idx]  # the other team bats
                elif "opt to bat" in status.lower():
                    batting_team = team
                else:
                    print("Could not get batting team")
    bowling_team = list(set(teams_this_match).difference([batting_team]))[0]
    print(f"{batting_team=}, {bowling_team=}")

    # Encode both teams with the label classes saved at training time.
    batting_team_enc, bowling_team_enc = None, None
    le = LabelEncoder()
    le.classes_ = np.load("model/team.npy", allow_pickle=True)
    if batting_team in le.classes_:
        batting_team_enc = le.transform([batting_team])[0]
    if bowling_team in le.classes_:
        bowling_team_enc = le.transform([bowling_team])[0]
    return batting_team, bowling_team, batting_team_enc, bowling_team_enc


def scrape(url):
    try:
        if selenium_fetch(url) is False:
            return ("Selenium scrape error",)
        with open("temp/temp.html") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        # Match metadata lives in inline <script> variables on the page.
        scripts = "\n".join(s.text for s in soup.find_all("script"))
        match_state = re.findall(r'var matchState ="([\da-zA-Z]*)"', scripts)[0].lower()
        print(f"{match_state=}")
        title = soup.find("title").text
        match_format = re.findall(r'var matchFormat = "([\da-zA-Z]*)"', scripts)[0]
        print(f"{match_format=}")
        if match_format not in {"ODI", "T20"}:
            raise ValueError("Not ODI or T20")

        # The status banner's CSS class depends on the match state.
        status_class = {
            "inprogress": "cb-text-inprogress",
            "complete": "cb-text-complete",
            "inningsbreak": "cb-text-inningsbreak",
        }
        status = ""
        if match_state in status_class:
            status = soup.find_all("div", {"class": status_class[match_state]})[0].text
        score = (
            soup.find_all("div", {"class": "cb-min-bat-rw"})[0].text
            if match_state in ("complete", "inprogress", "inningsbreak")
            else ""
        )
        if match_state != "inprogress":
            # Nothing to predict; pad the unavailable fields with None.
            return (
                match_state, score,
                None, None, None, None, None, None, None, None,
                match_format, title, status,
                None, None, None, None, None,
            )

        teams_this_match = re.match(
            r"(.*) vs (.*)",
            soup.find_all("a", {"class": "cb-nav-tab"})[0]["title"].split(",")[0],
        ).groups()
        print(f"{teams_this_match=}")

        # The last "runs/wickets (overs)" occurrence on the page is the live score.
        data = re.findall(r"(\d+)/(\d+) \(([\.\d]+)\)", soup.text)
        runs, wkts, overs = map(float, data[-1])
        print(f"{runs=}, {wkts=}, {overs=}")

        if overs >= 5:
            last_5_ovs = (
                soup.find_all("span", string="Last 5 overs")[0].find_next("span").text
            )
            run_last_5_overs, wkt_last_5_overs = map(
                float, re.match(r"(\d+) runs, (\d+) wkts", last_5_ovs).groups()
            )
        else:
            # Fewer than five overs bowled: the whole innings is the window.
            run_last_5_overs, wkt_last_5_overs = runs, wkts
        print(f"{run_last_5_overs=}, {wkt_last_5_overs=}")

        req_rr = -9999
        if soup.find_all("span", string="\xa0\xa0REQ:\xa0"):
            reqdata = (
                soup.find_all("span", string="\xa0\xa0REQ:\xa0")[0]
                .find_next("span")
                .text
            )
            if reqdata.strip() != "":
                req_rr = float(re.match(r"([\d\.]+)", reqdata).group(1))
            else:
                print("REQ_RR not parsed")

        crr = -9999
        if soup.find_all("span", string="\xa0\xa0CRR:\xa0"):
            crrdata = (
                soup.find_all("span", string="\xa0\xa0CRR:\xa0")[0]
                .find_next("span")
                .text
            )
            if crrdata.strip() != "":
                crr = float(re.match(r"([\d\.]+)", crrdata).group(1))
            else:
                print("CRR not parsed")
        print(f"{crr=}, {req_rr=}")

        # A required run rate is only shown while chasing, i.e. in the second innings.
        inning = 2 if req_rr > 0 else 1
        (
            batting_team,
            bowling_team,
            batting_team_enc,
            bowling_team_enc,
        ) = get_batting_team(soup, status, inning, teams_this_match)

        req = -9999
        if inning == 2:
            req = int(re.match(r".*need (\d+) runs", status).group(1))
            print(f"{req=}")
        else:
            print("Not chasing, so target not set")

        return (
            match_state,
            score,
            run_last_5_overs,
            wkt_last_5_overs,
            runs,
            wkts,
            overs,
            req_rr,
            req,
            crr,
            match_format,
            title,
            status,
            batting_team,
            bowling_team,
            batting_team_enc,
            bowling_team_enc,
            inning,
        )
    except Exception as e:
        print(traceback.format_exc())
        return (str(e),)


def get_live_matches(url):
    """Return {match name: absolute URL} for every match in the live-scores menu."""
    if selenium_fetch(url) is False:
        return None
    with open("temp/temp.html") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    matches = soup.find_all("a", {"class": "cb-mat-mnu-itm cb-ovr-flo"})
    return {
        m.text: urljoin(url, m.get("href"))
        for m in matches
        if m not in soup.find_all("a", {"id": "live-scores-link"})
    }


if __name__ == "__main__":
    url = "https://cricbuzz.com/live-cricket-scores/79055/wa-vs-saus-3rd-match-australia-domestic-one-day-cup-2023-24"
    print(scrape(url))
    # print(get_live_matches("https://cricbuzz.com"))