import os  # used below to make sure temp/ exists before the page dump is written
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import numpy as np
from sklearn.preprocessing import LabelEncoder
import traceback
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import chromedriver_autoinstaller
from selenium.common import exceptions
chromedriver_autoinstaller.install()
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
def selnium(url):
    try:
        os.makedirs("temp", exist_ok=True)  # the dump directory may not exist on a fresh checkout
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            with open("temp/temp.html", "w+") as f:
                f.write(driver.page_source)
        finally:
            driver.quit()  # always release the browser session, even if the write fails
        return True
    except exceptions.InvalidSessionIdException:
        print(traceback.format_exc())
        return False
    except Exception:
        print(traceback.format_exc())
        return False
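# get_batting_team() infers which side is batting from the status text: in the
# second innings the status starts with the chasing team ("<team> need ..."),
# otherwise the toss line ("opt to bat" / "opt to bowl") decides it. The team
# names are then label-encoded with the classes saved in model/team.npy.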
def get_batting_team(soup, status, inning, teams_this_match):
    # teams_this_match = sorted(
    #     np.load("team.npy", allow_pickle=True),
    #     key=lambda x: soup.text.lower().count(x.lower()),
    # )[-2:]
    # print(f"{teams_this_match=}")
batting_team = "" | |
if inning == 2: | |
batting_team = status.split("need")[0].strip() | |
for idx, team in enumerate(teams_this_match): | |
if team.lower() in batting_team.lower(): | |
batting_team = team | |
else: | |
for idx, team in enumerate(teams_this_match): | |
if team.lower() in status.lower(): | |
if "opt to bowl" in status.lower(): | |
batting_team = teams_this_match[int(~idx)] | |
elif "opt to bat" in status.lower(): | |
batting_team = team | |
else: | |
print("Could not get batting team)") | |
bowling_team = list(set(teams_this_match).difference([batting_team]))[0] | |
print(f"{batting_team=}, {bowling_team=}") | |
batting_team_enc, bowling_team_enc = None, None | |
le = LabelEncoder() | |
le.classes_ = np.load("model/team.npy", allow_pickle=True) | |
if batting_team in le.classes_: | |
batting_team_enc = le.transform([batting_team])[0] | |
if bowling_team in le.classes_: | |
bowling_team_enc = le.transform([bowling_team])[0] | |
return batting_team, bowling_team, batting_team_enc, bowling_team_enc | |
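# scrape() fetches one match page and returns an 18-tuple:
# (matchState, score, run_last_5_overs, wkt_last_5_overs, runs, wkts, overs,
#  req_rr, req, crr, format, title, status, batting_team, bowling_team,
#  batting_team_enc, bowling_team_enc, inning).
# On any failure it returns a 1-tuple containing the error message.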
def scrape(url):
    try:
        if selnium(url) is False:
            return ("Selenium scrape error",)
        with open("temp/temp.html", "r") as f:
            soup = BeautifulSoup(f.read(), "html.parser")
        # print("Debug>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>.", soup.text)
        scripts = "\n".join(s.text for s in soup.find_all("script"))
        matchState = re.findall(r'var matchState ="([\da-zA-Z]*)"', scripts)[0].lower()
        print(f"{matchState=}")
        title = soup.find_all("title")[0].text
        format = re.findall(r'var matchFormat = "([\da-zA-Z]*)"', scripts)[0]
        print(f"{format=}")
        if format not in {"ODI", "T20"}:
            raise ValueError("Not ODI or T20")
        status = (
            soup.find_all("div", {"class": "cb-text-inprogress"})[0].text
            if matchState == "inprogress"
            else soup.find_all("div", {"class": "cb-text-complete"})[0].text
            if matchState == "complete"
            else soup.find_all("div", {"class": "cb-text-inningsbreak"})[0].text
            if matchState == "inningsbreak"
            else ""
        )
        score = (
            soup.find_all("div", {"class": "cb-min-bat-rw"})[0].text
            if matchState in ["complete", "inprogress", "inningsbreak"]
            else ""
        )
        if matchState != "inprogress":
            # Not live: return only the fields that can be read off a finished/paused page.
            return (
                matchState,
                score,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                format,
                title,
                status,
                None,
                None,
                None,
                None,
                None,
            )
        teams_this_match = re.match(
            r"(.*) vs (.*)",
            soup.find_all("a", {"class": "cb-nav-tab"})[0]["title"].split(",")[0],
        ).groups()
        print(f"{teams_this_match=}")
        data = re.findall(r"(\d+)/(\d+) \(([\.\d]+)\)", soup.text)
        runs, wkts, overs = map(float, data[-1])
        print(f"{runs=}, {wkts=}, {overs=}")
        if overs >= 5:
            last_5_ovs = (
                soup.find_all("span", string="Last 5 overs")[0].find_next("span").text
            )
            run_last_5_overs, wkt_last_5_overs = map(
                float, re.match(r"(\d+) runs, (\d+) wkts", last_5_ovs).groups()
            )
        else:
            # Fewer than five overs bowled: the whole innings counts as the "last 5 overs".
            run_last_5_overs, wkt_last_5_overs = runs, wkts
        print(f"{run_last_5_overs=}, {wkt_last_5_overs=}")
        req_rr = -9999
        if soup.find_all("span", string="\xa0\xa0REQ:\xa0"):
            reqdata = (
                soup.find_all("span", string="\xa0\xa0REQ:\xa0")[0]
                .find_next("span")
                .text
            )
            if reqdata.strip() != "":
                req_rr = float(re.match(r"([\d\.]+)", reqdata).group(1))
        else:
            print("REQ_RR not parsed")
        crr = -9999
        if soup.find_all("span", string="\xa0\xa0CRR:\xa0"):
            crrdata = (
                soup.find_all("span", string="\xa0\xa0CRR:\xa0")[0]
                .find_next("span")
                .text
            )
            if crrdata.strip() != "":
                crr = float(re.match(r"([\d\.]+)", crrdata).group(1))
        else:
            print("CRR not parsed")
        print(f"{crr=}, {req_rr=}")
        # A required run rate is only shown during a chase, so a positive REQ implies innings 2.
        inning = 2 if req_rr > 0 else 1
        (
            batting_team,
            bowling_team,
            batting_team_enc,
            bowling_team_enc,
        ) = get_batting_team(soup, status, inning, teams_this_match)
        req = -9999
        if inning == 2:
            req = int(re.match(r".*need (\d+) runs", status).groups()[0])
            print(f"{req=}")
        else:
            print("Not chasing so target not set")
        return (
            matchState,
            score,
            run_last_5_overs,
            wkt_last_5_overs,
            runs,
            wkts,
            overs,
            req_rr,
            req,
            crr,
            format,
            title,
            status,
            batting_team,
            bowling_team,
            batting_team_enc,
            bowling_team_enc,
            inning,
        )
    except Exception as e:
        print(traceback.format_exc())
        return (str(e),)
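# get_live_matches() returns {match title: absolute URL} for every match listed in
# the Cricbuzz live-scores menu, skipping the "live scores" landing link itself.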
def get_live_matches(url):
    if selnium(url) is False:
        return None
    with open("temp/temp.html", "r") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    matches = soup.find_all("a", {"class": "cb-mat-mnu-itm cb-ovr-flo"})
    return {
        m.text: urljoin(url, m.get("href"))
        for m in matches
        if m not in soup.find_all("a", {"id": "live-scores-link"})
    }
if __name__ == "__main__":
    url = "https://cricbuzz.com/live-cricket-scores/79055/wa-vs-saus-3rd-match-australia-domestic-one-day-cup-2023-24"
    print(scrape(url))
    # print(get_live_matches("https://cricbuzz.com"))