# Codeforces scraper: fetches problem statements and editorials, caching them locally.
import os

import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

# Local cache layout: <this dir>/prescraped/codeforces/{Problems,Editorials}/<pid>
dir_path = os.path.dirname(os.path.realpath(__file__))
SAVE_PATH = f"{dir_path}/prescraped/codeforces/"
# Filenames already present in the cache; appended to as new pages are scraped.
scraped_problems = os.listdir(f"{SAVE_PATH}Problems")
scraped_editorials = os.listdir(f"{SAVE_PATH}Editorials")
def anti_scrape(soup):
    """Bypass Codeforces' RCPC-cookie anti-scraping interstitial.

    If *soup* is the "Just a moment..." challenge page, extract the AES
    key/IV/ciphertext hex strings from its last inline script, derive the
    RCPC cookie value, re-fetch the real page with that cookie, and return
    the new soup.  Otherwise *soup* is returned unchanged.

    Bug fix: the original rebound the local name ``soup`` and returned
    nothing, so callers could never receive the bypassed page.
    """
    if soup.text == "Just a moment...Enable JavaScript and cookies to continue":
        print("Bypassing anti-scrap protection...")
        scr = soup.findAll("script")[-1].string
        print(scr)
        # Statements from "var a=toNumbers" onward: the first three
        # toNumbers("...") calls carry hex-encoded AES key, IV, ciphertext.
        scr = scr[scr.index("var a=toNumbers"):].split(';')
        line = scr[0]
        abc = []
        while "toNumbers" in line:
            i = line.index("toNumbers")
            line = line[i + 11:]  # skip past 'toNumbers("'
            abc.append(line[:line.index('"')])
        # Third-party (pycryptodome); imported locally so the module loads
        # even when the challenge page never appears.
        from Crypto.Cipher import AES

        def to_numbers(x):
            # Hex string -> bytes, mirroring the page's JS toNumbers().
            return bytes(int(x[i:i + 2], 16) for i in range(0, len(x), 2))

        key, iv, cipher = map(to_numbers, abc)
        aes = AES.new(key, AES.MODE_CBC, iv)
        rcpc = aes.decrypt(cipher).hex()
        print(f"RCPC = {rcpc}")
        # The second-to-last statement holds the redirect URL in quotes.
        url = scr[-2]
        url = url[url.index('"') + 1:-1]
        r = requests.get(url, cookies={"RCPC": rcpc})
        soup = BeautifulSoup(r.text, "html.parser")
    return soup
def read(file_path):
    """Return the entire text contents of *file_path*."""
    with open(file_path, 'r') as f:
        return f.read()
def from_url(url):
    """Derive a problem id (e.g. '1985G') from a Codeforces problem URL.

    Works for both '.../problemset/problem/<contest>/<index>' and
    '.../contest/<contest>/problem/<index>' forms: the last three path
    segments minus the literal 'problem' segment, concatenated.
    """
    tail = url.split('/')[-3:]
    tail.remove('problem')
    return ''.join(tail)
def problem(url):
    """Scrape a Codeforces problem page into a dict of its parts.

    Returns a dict with keys "title", "time_limit", "memory_limit",
    "statement", "sample_input", "sample_outputs", "note" for a freshly
    scraped page, or just {"statement": ...} when a non-empty cached copy
    already exists under SAVE_PATH/Problems/<pid>.  The plain statement
    text is written back to that cache file and the pid is recorded in
    scraped_problems.
    """
    pid = from_url(url)
    # Cache hit: only the bare statement was persisted, so only that is returned.
    if (pid in scraped_problems):
        statement = read(SAVE_PATH + "Problems/" + pid)
        if (len(statement)):
            return {"statement": statement}
    chrome_options = Options()
    chrome_options.add_argument("--headless") # Opens the browser up in background
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        response = browser.page_source
        # response = requests.get(url)
        soup = BeautifulSoup(response, 'html.parser')
        # anti_scrape(soup)
        problem_statements = soup.find_all(class_='problem-statement')
        # print(p)
        # NOTE(review): a problem page normally has exactly one
        # .problem-statement div; if none is found, `data` below stays
        # unbound and the final return raises NameError — confirm/guard.
        for statement in problem_statements:
            # title
            title = statement.find(class_='title').text.strip()
            # time limit
            time_limit = statement.find(class_='time-limit').text.strip().replace('time limit per test', '')
            # memory limit
            memory_limit = statement.find(class_='memory-limit').text.strip().replace('memory limit per test', '')
            # Fails for interactives (and presumably output only's)
            # # input format
            # input_spec = statement.find(class_='input-specification').text.strip().replace('Input', '')
            # # print("Input Specification:", input_spec)
            # # output format
            # output_spec = statement.find(class_='output-specification').text.strip().replace('Output', '')
            # # print("Output Specification:", output_spec)
            # Statement body: concatenate every top-level div before the
            # input spec, skipping the header block.  (Local `problem`
            # shadows the enclosing function name within this scope only.)
            problem = ""
            for child in statement.children:
                if child.name == 'div' and ('input-specification' in child.get('class', [])):
                    break
                if child.name == 'div' and ('header' not in child.get('class', [])):
                    problem += child.text.strip()
            # sample inputs
            examples = statement.find(class_='sample-tests')
            input_tests = examples.find_all(class_='test-example-line')
            inputs = ""
            for i in input_tests:
                input_example_lines = i.text.strip()
                inputs += input_example_lines
                inputs += '\n'
            # sample outputs
            output_tests = examples.find_all(class_='output')
            outputs = ""
            # print(output_tests.text.strip())
            for i in output_tests:
                output_example_lines = i.text.strip().replace('Output\n', '')
                outputs += output_example_lines
                outputs += '\n'
            note = statement.find(class_='note')
            notes = ""
            if note:
                note_text = note.text.strip().replace('Note', '')
                notes += note_text
                # print("Note:", note_text)
            data = {
                "title": title,
                "time_limit": time_limit,
                "memory_limit": memory_limit,
                # "input_format": input_spec,
                # "output_format": output_spec,
                "statement": problem,
                "sample_input": inputs,
                "sample_outputs": outputs,
                "note": notes
            }
        # Persist only the statement text; the full dict goes to the caller.
        with open(SAVE_PATH + 'Problems/' + pid, 'w') as f:
            f.write(data["statement"])
        scraped_problems.append(pid)
        return data
def editorial(prob_url, edi_url, bot=None, query_func=None):
    """Scrape the editorial text for one problem from a contest editorial page.

    *prob_url* identifies the problem; *edi_url* is the editorial blog page.
    Returns the editorial as lightly marked-up text ($$$...$$$ for math,
    **...** for bold) and caches it under SAVE_PATH/Editorials/<pid>.
    *bot* and *query_func* are currently unused (see the commented-out
    call near the end).
    """
    pid = from_url(prob_url)
    # Cache hit: reuse a previously scraped, non-empty editorial.
    if (pid in scraped_editorials):
        edi = read(SAVE_PATH + "Editorials/" + pid)
        if (len(edi)):
            return edi
    # Normalise the problem URL to the relative 'contest/<id>/problem/<idx>'
    # form, which is how editorial pages link to individual problems.
    domain = 'https://codeforces.com/'
    if (domain in prob_url):
        prob_url = prob_url[len(domain):]
    if ('problemset/problem/' in prob_url):
        # print(prob_url)
        prob_url = prob_url[len('problemset/problem/'):]
        # print(prob_url)
        prob_url = 'contest/' + '/problem/'.join(prob_url.split('/'))
    chrome_options = Options()
    chrome_options.add_argument("--headless") # Opens the browser up in background
    with Chrome(options=chrome_options) as browser:
        browser.get(edi_url)
        response = browser.page_source
        soup = BeautifulSoup(response, 'html.parser')
        # anti_scrape(soup)
        soup = soup.find_all(class_='content')[0]
        while soup.pre != None: # removes all code
            soup.pre.decompose()
        edi = []
        on = False # checks what problem we're on
        contest_url = '/'.join(prob_url.split('/')[:-1])
        # Walk the text-bearing tags in document order; turn `on` when a
        # link to this problem starts its section, off again when a link
        # to another problem of the same contest starts the next section.
        for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'center', 'b']):
            links = tag.find_all(href=True)
            # if (len(links)):
            #     print(links[0].prettify())
            #     print(prob_url)
            #     print(contest_url)
            if (len(links) > 0 and prob_url in links[0].prettify()):
                on = True
            elif len(links) > 0 and contest_url in links[0].prettify():
                on = False
            elif on:
                latex_content = ""
                for elem in tag.descendants: # In case LaTeX doesn't render automatically with bs4
                    # Only keep nodes whose direct parent is a content tag.
                    if (elem.find_parent().name != 'p' and elem.find_parent().name != 'a' and elem.find_parent().name != 'b' and elem.find_parent().name != 'center' and elem.find_parent().name != 'li'):
                        continue
                    bold = elem.find_parent().name == 'b'
                    if isinstance(elem, str):
                        # Skip bold "code" headings left over from the removed <pre> blocks.
                        if ('code' in elem.lower() and bold):
                            continue
                        latex_content += '**' * bold + elem + '**' * bold
                    elif elem.name == "script" and elem.get("type") == "math/tex":
                        # MathJax source scripts hold the raw LaTeX.
                        latex_content += "$$$" + elem.string + "$$$"
                edi.append(latex_content)
        edi = '\n'.join(edi)
        # print('bot', bot)
        # if (bot):
        #     edi = bot.chat(query_func(problem(prob_url), edi))
        with open(SAVE_PATH + 'Editorials/' + pid, 'w') as f:
            f.write(edi)
        scraped_editorials.append(pid)
        return edi
# print(problem('https://codeforces.com/problemset/problem/1985/G')) | |