# comp-prog-bot / utils / scrapers / codeforces.py
# RithwikG — initial commit (7a8878c)
from bs4 import BeautifulSoup
import requests
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import os
# Absolute path of this file's directory, so the cache is found regardless
# of the current working directory.
dir_path = os.path.dirname(os.path.realpath(__file__))
SAVE_PATH = dir_path + '/prescraped/codeforces/'

# Create the cache directories if missing so a fresh checkout does not
# crash with FileNotFoundError on the os.listdir calls below.
os.makedirs(SAVE_PATH + "Problems", exist_ok=True)
os.makedirs(SAVE_PATH + "Editorials", exist_ok=True)

# In-memory index of already-scraped ids (one cache file per problem id).
scraped_problems = os.listdir(SAVE_PATH + "Problems")
scraped_editorials = os.listdir(SAVE_PATH + "Editorials")
def anti_scrape(soup):
    """Bypass the Codeforces anti-scraping JS challenge page, if present.

    When the fetched page is the "Just a moment..." challenge, extract the
    AES key/iv/ciphertext hex strings from the inline script, derive the
    RCPC cookie, and re-fetch the real page with that cookie.

    Parameters:
        soup: BeautifulSoup of a page that may be the challenge page.

    Returns:
        BeautifulSoup of the real page when the challenge was bypassed,
        otherwise the original ``soup`` unchanged.  (The original version
        rebound a local variable and implicitly returned None, discarding
        the bypassed page.)
    """
    if soup.text != "Just a moment...Enable JavaScript and cookies to continue":
        return soup

    print("Bypassing anti-scrap protection...")
    script_src = soup.findAll("script")[-1].string
    print(script_src)

    # The challenge script looks like:  var a=toNumbers("<hex>"); ...
    statements = script_src[script_src.index("var a=toNumbers"):].split(';')
    line = statements[0]
    hex_args = []
    while "toNumbers" in line:
        start = line.index("toNumbers")
        line = line[start + 11:]  # skip past 'toNumbers("'
        hex_args.append(line[:line.index('"')])

    # pycryptodome; imported lazily so the module loads without it whenever
    # the challenge page is never encountered.
    from Crypto.Cipher import AES

    def to_numbers(hex_str):
        # Decode a hex string into raw bytes (two hex chars per byte).
        return bytes(int(hex_str[i:i + 2], 16) for i in range(0, len(hex_str), 2))

    key, iv, cipher = map(to_numbers, hex_args)
    aes = AES.new(key, AES.MODE_CBC, iv)
    rcpc = aes.decrypt(cipher).hex()
    print(f"RCPC = {rcpc}")

    # The redirect target URL is quoted inside the second-to-last statement.
    url = statements[-2]
    url = url[url.index('"') + 1:-1]
    r = requests.get(url, cookies={"RCPC": rcpc})
    return BeautifulSoup(r.text, "html.parser")
def read(file_path):
    """Return the entire contents of the text file at ``file_path``."""
    with open(file_path, 'r') as handle:
        return handle.read()
def from_url(url):
    """Derive a problem id (contest number + index, e.g. '1985G') from a URL.

    Takes the last three path segments of the URL, drops the literal
    'problem' segment, and concatenates what remains.
    """
    segments = url.split('/')[-3:]
    segments.pop(segments.index('problem'))
    return ''.join(segments)
def problem(url):
    """Scrape a Codeforces problem page into a dict of its parts.

    Parameters:
        url: full URL of the problem page.

    Returns:
        A dict with keys ``title``, ``time_limit``, ``memory_limit``,
        ``statement``, ``sample_input``, ``sample_outputs`` and ``note``
        for the first problem statement found on the page.  If the
        statement was scraped before, a cached dict containing only the
        ``statement`` key is returned instead.  Returns None when no
        problem statement is found (the original left ``data`` unbound in
        that case).
    """
    pid = from_url(url)

    # Serve from the on-disk cache when a non-empty cache file exists.
    if pid in scraped_problems:
        statement = read(SAVE_PATH + "Problems/" + pid)
        if len(statement):
            return {"statement": statement}

    # Codeforces blocks plain HTTP clients; render with headless Chrome.
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Opens the browser up in background
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        response = browser.page_source
    soup = BeautifulSoup(response, 'html.parser')

    for statement in soup.find_all(class_='problem-statement'):
        title = statement.find(class_='title').text.strip()
        time_limit = statement.find(class_='time-limit').text.strip().replace('time limit per test', '')
        memory_limit = statement.find(class_='memory-limit').text.strip().replace('memory limit per test', '')

        # Statement body: every top-level div before the input
        # specification, skipping the header block.  Input/output specs
        # are not extracted separately because interactive (and output-
        # only) problems lack those divs.
        problem_text = ""
        for child in statement.children:
            if child.name == 'div' and 'input-specification' in child.get('class', []):
                break
            if child.name == 'div' and 'header' not in child.get('class', []):
                problem_text += child.text.strip()

        # Sample inputs.
        examples = statement.find(class_='sample-tests')
        inputs = ""
        for test in examples.find_all(class_='test-example-line'):
            inputs += test.text.strip() + '\n'

        # Sample outputs.
        outputs = ""
        for test in examples.find_all(class_='output'):
            outputs += test.text.strip().replace('Output\n', '') + '\n'

        # Optional note section.
        notes = ""
        note = statement.find(class_='note')
        if note:
            notes += note.text.strip().replace('Note', '')

        data = {
            "title": title,
            "time_limit": time_limit,
            "memory_limit": memory_limit,
            "statement": problem_text,
            "sample_input": inputs,
            "sample_outputs": outputs,
            "note": notes,
        }

        # Persist the statement and remember it for this process.
        with open(SAVE_PATH + 'Problems/' + pid, 'w') as f:
            f.write(data["statement"])
        scraped_problems.append(pid)
        return data

    # No problem-statement div found (e.g. the page failed to load).
    return None
def editorial(prob_url, edi_url, bot=None, query_func=None):
    """Scrape the editorial section for a single problem from a blog page.

    Parameters:
        prob_url: URL of the problem whose editorial is wanted.
        edi_url: URL of the editorial (blog) page.
        bot, query_func: reserved for post-processing with a chat bot;
            currently unused (see the commented hook below).

    Returns:
        The editorial text as one newline-joined string.  ``<pre>`` code
        blocks are stripped and inline LaTeX is wrapped in ``$$$``.
    """
    pid = from_url(prob_url)

    # Serve from the on-disk cache when a non-empty cache file exists.
    if pid in scraped_editorials:
        edi = read(SAVE_PATH + "Editorials/" + pid)
        if len(edi):
            return edi

    # Normalize the problem URL to the 'contest/<id>/problem/<idx>' form
    # used by anchor hrefs inside editorial posts.
    domain = 'https://codeforces.com/'
    if domain in prob_url:
        prob_url = prob_url[len(domain):]
    if 'problemset/problem/' in prob_url:
        prob_url = prob_url[len('problemset/problem/'):]
        prob_url = 'contest/' + '/problem/'.join(prob_url.split('/'))

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Opens the browser up in background
    with Chrome(options=chrome_options) as browser:
        browser.get(edi_url)
        response = browser.page_source
    soup = BeautifulSoup(response, 'html.parser')
    soup = soup.find_all(class_='content')[0]

    # Remove all code blocks -- only the prose of the editorial is wanted.
    while soup.pre is not None:
        soup.pre.decompose()

    edi = []
    on = False  # True while iterating tags belonging to *our* problem's section
    contest_url = '/'.join(prob_url.split('/')[:-1])
    for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'center', 'b']):
        links = tag.find_all(href=True)
        if len(links) > 0 and prob_url in links[0].prettify():
            # A link to our problem opens its editorial section.
            on = True
        elif len(links) > 0 and contest_url in links[0].prettify():
            # A link to another problem of the same contest closes it.
            on = False
        elif on:
            latex_content = ""
            for elem in tag.descendants:  # in case LaTeX doesn't render automatically with bs4
                parent_name = elem.find_parent().name
                if parent_name not in ('p', 'a', 'b', 'center', 'li'):
                    continue
                bold = parent_name == 'b'
                if isinstance(elem, str):
                    # Skip bolded "code"-style spoiler toggles.
                    if 'code' in elem.lower() and bold:
                        continue
                    latex_content += '**' * bold + elem + '**' * bold
                elif elem.name == "script" and elem.get("type") == "math/tex":
                    # elem.string is None for empty script tags; guard so the
                    # concatenation cannot raise TypeError.
                    latex_content += "$$$" + (elem.string or "") + "$$$"
            edi.append(latex_content)

    edi = '\n'.join(edi)
    # Hook for summarizing the editorial with a chat bot, kept for parity
    # with the original design:
    # if bot:
    #     edi = bot.chat(query_func(problem(prob_url), edi))
    with open(SAVE_PATH + 'Editorials/' + pid, 'w') as f:
        f.write(edi)
    scraped_editorials.append(pid)
    return edi
# print(problem('https://codeforces.com/problemset/problem/1985/G'))