# RithwikG's picture
# initial commit
# 7a8878c
from bs4 import BeautifulSoup
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import os
# Absolute directory containing this script.
dir_path = os.path.dirname(os.path.realpath(__file__))
# Root of the on-disk scrape cache (note: trailing slash is included).
SAVE_PATH = dir_path + '/prescraped/usaco/'
# In-memory index of already-cached ids, kept in sync as new scrapes are
# written by problem()/editorial().
# NOTE(review): assumes both subdirectories already exist — os.listdir
# raises FileNotFoundError at import time otherwise; confirm they ship
# with the repository.
scraped_problems = os.listdir(SAVE_PATH + "Problems")
scraped_editorials = os.listdir(SAVE_PATH + "Editorials")
def anti_scrape(soup):
    """Bypass the JavaScript/cookie anti-scraping challenge page, if present.

    If *soup* is the "Just a moment..." challenge page, extract the AES-CBC
    key / iv / ciphertext hex triple embedded in the page's last <script>
    tag, decrypt the RCPC cookie value, and re-fetch the real page with
    that cookie set.

    Returns the BeautifulSoup of the re-fetched page, or the original
    *soup* unchanged when no challenge page is detected.
    """
    if soup.text != "Just a moment...Enable JavaScript and cookies to continue":
        # Not a challenge page — nothing to bypass.
        return soup
    print("Bypassing anti-scrap protection...")
    # The last <script> contains statements of the form
    #   var a=toNumbers("<key>"),b=toNumbers("<iv>"),c=toNumbers("<cipher>");
    # followed (second-to-last statement) by the quoted redirect URL.
    scr = soup.findAll("script")[-1].string
    scr = scr[scr.index("var a=toNumbers"):].split(';')
    line = scr[0]
    hex_parts = []
    while "toNumbers" in line:
        i = line.index("toNumbers")
        line = line[i + 11:]  # skip past 'toNumbers("'
        hex_parts.append(line[:line.index('"')])
    from Crypto.Cipher import AES  # local import: only needed on the bypass path

    def to_numbers(x):
        # Hex string -> bytes, e.g. "0a1b" -> b'\x0a\x1b'.
        return bytes(int(x[i:i + 2], 16) for i in range(0, len(x), 2))

    key, iv, cipher = map(to_numbers, hex_parts)
    aes = AES.new(key, AES.MODE_CBC, iv)
    rcpc = aes.decrypt(cipher).hex()
    print(f"RCPC = {rcpc}")
    url = scr[-2]
    url = url[url.index('"') + 1:-1]
    r = requests.get(url, cookies={"RCPC": rcpc})
    # BUG FIX: the original rebuilt `soup` here but never returned it, so the
    # bypass result was silently discarded and the caller kept the challenge
    # page. Return the parsed real page instead.
    return BeautifulSoup(r.text, "html.parser")
def read(file_path):
    """Return the entire text content of the file at *file_path*."""
    with open(file_path, 'r') as f:
        contents = f.read()
    return contents
def from_url(url):
    """Return the last '/'-separated component of *url* (used as a cache id)."""
    return url.rsplit('/', 1)[-1]
def problem(url):
    """Fetch a USACO problem statement, caching it under SAVE_PATH/Problems/.

    Returns {"statement": <text>} with <pre> code blocks and the sample
    input/output section removed.
    """
    pid = from_url(url)
    # Serve from the on-disk cache when a non-empty scrape already exists
    # (an empty cached file falls through to a re-scrape).
    if (pid in scraped_problems):
        statement = read(SAVE_PATH + "Problems/" + pid)
        if (len(statement)):
            return {"statement": statement}
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # First element with class 'problem-text' holds the statement body.
    soup = soup.find_all(class_='problem-text')[0]
    while soup.pre != None: # removes all code
        soup.pre.decompose()
    prob = soup.text
    # Cut everything between the first "SAMPLE INPUT" and the last
    # "SCORING:" (i.e. the sample I/O), keeping the statement head and the
    # scoring tail.
    # NOTE(review): when "SCORING:" is absent this still injects a literal
    # "SCORING:" before the tail — presumably every USACO page contains the
    # marker; verify against the live site.
    prob = prob.split("SAMPLE INPUT")
    prob[-1] = prob[-1].split("SCORING:")
    prob = prob[0] + "SCORING:" + prob[-1][-1]
    # Persist to the cache and record the id in the in-memory index.
    with open(SAVE_PATH + 'Problems/' + pid, 'w') as f:
        f.write(prob)
    scraped_problems.append(pid)
    return {"statement": prob}
def editorial(prob_url, edi_url, bot=None, query_func=None):
    """Fetch a USACO editorial, caching it under SAVE_PATH/Editorials/.

    Only <p> paragraphs that are direct children of <body> are kept, joined
    with newlines; <pre> code blocks are stripped first.  `prob_url`, `bot`
    and `query_func` belong to a currently-disabled LLM summarization hook
    and are otherwise unused — kept for interface compatibility.

    TODO: Fix random line breaks in the scrapes.
    """
    pid = from_url(edi_url)
    # Serve from the on-disk cache when a non-empty scrape already exists
    # (an empty cached file falls through to a re-scrape).
    # FIX: removed leftover debug print of pid and the whole cache listing.
    if pid in scraped_editorials:
        edi = read(SAVE_PATH + "Editorials/" + pid)
        if len(edi):
            return edi
    response = requests.get(edi_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    while soup.pre is not None:  # removes all code
        soup.pre.decompose()
    paragraphs = []
    for tag in soup.find_all(['p']):
        # Keep only paragraphs sitting directly under <body>; nested <p>
        # tags (inside tables, divs, ...) are skipped.
        if tag.parent.name != 'body':
            continue
        paragraphs.append(tag.text)
    edi = '\n'.join(paragraphs)
    # Disabled LLM post-processing hook:
    # if bot: edi = bot.chat(query_func(problem(prob_url), edi))
    # Persist to the cache and record the id in the in-memory index.
    with open(SAVE_PATH + 'Editorials/' + pid, 'w') as f:
        f.write(edi)
    scraped_editorials.append(pid)
    return edi
# print(editorial('https://usaco.org/current/data/sol_prob2_platinum_open24.html'))
# print(problem('https://usaco.org/index.php?page=viewproblem2&cpid=1428')['statement'])