# NOTE(review): the "Spaces: Sleeping" banner below the original header was a
# Hugging Face Space page artifact captured during scraping, not part of the code.
| from bs4 import BeautifulSoup | |
| import requests | |
| from selenium.webdriver import Chrome | |
| from selenium.webdriver.chrome.options import Options | |
| import os | |
# Absolute directory of this file; the scrape cache lives beside it.
dir_path = os.path.dirname(os.path.realpath(__file__))
SAVE_PATH = dir_path + '/prescraped/codeforces/'

# Ensure the cache directories exist before listing them
# (fix: os.listdir raised FileNotFoundError on a fresh checkout where
# prescraped/codeforces/{Problems,Editorials} had never been created).
os.makedirs(SAVE_PATH + "Problems", exist_ok=True)
os.makedirs(SAVE_PATH + "Editorials", exist_ok=True)

# In-memory index of problem ids already cached on disk; appended to as new
# problems/editorials are scraped and written out.
scraped_problems = os.listdir(SAVE_PATH + "Problems")
scraped_editorials = os.listdir(SAVE_PATH + "Editorials")
def anti_scrape(soup):
    """Bypass the Codeforces JavaScript anti-bot interstitial.

    When ``soup`` is the "Just a moment..." challenge page, extract the AES
    key/iv/ciphertext from its inline script, decrypt the RCPC cookie value,
    and re-fetch the real page with that cookie set.

    Fix: the original built the bypassed page into a local variable and
    implicitly returned ``None``, discarding its entire result. This version
    returns the re-fetched ``BeautifulSoup``; if no interstitial is detected,
    the original ``soup`` is returned unchanged.
    """
    if soup.text != "Just a moment...Enable JavaScript and cookies to continue":
        return soup  # not the challenge page; nothing to do
    print("Bypassing anti-scrap protection...")
    # The last inline <script> holds: var a=toNumbers("<key>"),b=toNumbers("<iv>"),
    # c=toNumbers("<cipher>"); ... and a redirect URL in its second-to-last statement.
    scr = soup.find_all("script")[-1].string
    print(scr)
    scr = scr[scr.index("var a=toNumbers"):].split(';')
    line = scr[0]
    abc = []
    # Collect the three quoted hex strings passed to toNumbers(...).
    while "toNumbers" in line:
        i = line.index("toNumbers")
        line = line[i + 11:]  # skip past 'toNumbers("'
        abc.append(line[:line.index('"')])
    from Crypto.Cipher import AES  # local import: only needed on the bypass path

    def to_numbers(x):
        # Hex string -> raw bytes, two hex digits per byte.
        return bytes(int(x[i:i + 2], 16) for i in range(0, len(x), 2))

    key, iv, cipher = map(to_numbers, abc)
    aes = AES.new(key, AES.MODE_CBC, iv)
    rcpc = aes.decrypt(cipher).hex()
    print(f"RCPC = {rcpc}")
    # Second-to-last script statement contains the quoted redirect URL.
    url = scr[-2]
    url = url[url.index('"') + 1:-1]
    r = requests.get(url, cookies={"RCPC": rcpc})
    return BeautifulSoup(r.text, "html.parser")
def read(file_path):
    """Return the entire text contents of the file at ``file_path``."""
    with open(file_path, 'r') as handle:
        return handle.read()
def from_url(url):
    """Derive a problem id (e.g. ``'1985G'``) from a Codeforces problem URL.

    Works for both URL shapes, since the last three path segments are
    ``.../problem/<contest>/<letter>`` or ``.../<contest>/problem/<letter>``;
    dropping the literal ``'problem'`` segment leaves contest id + letter.
    """
    segments = url.split('/')[-3:]
    segments.remove('problem')
    return ''.join(segments)
def problem(url):
    """Scrape one Codeforces problem page into a dict.

    Returns a dict with title, time/memory limits, statement text, sample
    input/output, and notes — or ``None`` if the page contains no problem
    statement. The statement text is cached under SAVE_PATH/Problems/<pid>;
    a non-empty cache hit short-circuits to ``{"statement": ...}``.

    Fix: previously ``data`` was only bound inside the per-statement loop, so
    a page with no ``.problem-statement`` element (e.g. an anti-bot page)
    raised NameError at the cache write / return. Now that case returns None.
    """
    pid = from_url(url)
    # Serve from the on-disk cache when a non-empty statement was saved earlier.
    if pid in scraped_problems:
        statement = read(SAVE_PATH + "Problems/" + pid)
        if len(statement):
            return {"statement": statement}
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run the browser in the background
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        response = browser.page_source
    soup = BeautifulSoup(response, 'html.parser')
    # anti_scrape(soup)
    problem_statements = soup.find_all(class_='problem-statement')
    data = None  # stays None when the page has no problem statement
    for statement in problem_statements:
        title = statement.find(class_='title').text.strip()
        time_limit = statement.find(class_='time-limit').text.strip().replace('time limit per test', '')
        memory_limit = statement.find(class_='memory-limit').text.strip().replace('memory limit per test', '')
        # Input/output-specification parsing is intentionally omitted: it
        # fails for interactive (and presumably output-only) problems.
        # Statement body: every non-header <div> child up to (not including)
        # the input specification. (Renamed from 'problem', which shadowed
        # this function's own name.)
        statement_text = ""
        for child in statement.children:
            if child.name == 'div' and 'input-specification' in child.get('class', []):
                break
            if child.name == 'div' and 'header' not in child.get('class', []):
                statement_text += child.text.strip()
        # Sample tests: each example line / output block, newline-terminated.
        examples = statement.find(class_='sample-tests')
        inputs = ''.join(
            line.text.strip() + '\n'
            for line in examples.find_all(class_='test-example-line')
        )
        outputs = ''.join(
            block.text.strip().replace('Output\n', '') + '\n'
            for block in examples.find_all(class_='output')
        )
        note = statement.find(class_='note')
        notes = note.text.strip().replace('Note', '') if note else ""
        data = {
            "title": title,
            "time_limit": time_limit,
            "memory_limit": memory_limit,
            "statement": statement_text,
            "sample_input": inputs,
            "sample_outputs": outputs,
            "note": notes,
        }
    if data is None:
        return None  # no problem statement found on the page
    # Cache the statement text and record the pid as scraped.
    with open(SAVE_PATH + 'Problems/' + pid, 'w') as f:
        f.write(data["statement"])
    scraped_problems.append(pid)
    return data
def editorial(prob_url, edi_url, bot=None, query_func=None):
    """Scrape the editorial text for one problem from a Codeforces blog page.

    ``prob_url`` identifies the problem; ``edi_url`` is the editorial blog
    post. The text between the link to this problem and the link to the next
    problem of the same contest is collected (code blocks stripped, bold text
    wrapped in ``**``, rendered MathJax sources wrapped in ``$$$``), cached
    under SAVE_PATH/Editorials/<pid>, and returned as one string.

    ``bot`` / ``query_func`` are currently unused (reserved for the
    commented-out summarisation step) and kept for interface compatibility.
    """
    pid = from_url(prob_url)
    # Serve from the on-disk cache when a non-empty editorial was saved earlier.
    if pid in scraped_editorials:
        cached = read(SAVE_PATH + "Editorials/" + pid)
        if len(cached):
            return cached
    # Normalise the problem URL to the relative 'contest/<id>/problem/<letter>'
    # form, which is how editorial posts link to problems.
    domain = 'https://codeforces.com/'
    if domain in prob_url:
        prob_url = prob_url[len(domain):]
    if 'problemset/problem/' in prob_url:
        prob_url = prob_url[len('problemset/problem/'):]
        prob_url = 'contest/' + '/problem/'.join(prob_url.split('/'))
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run the browser in the background
    with Chrome(options=chrome_options) as browser:
        browser.get(edi_url)
        response = browser.page_source
    soup = BeautifulSoup(response, 'html.parser')
    # anti_scrape(soup)
    soup = soup.find_all(class_='content')[0]
    while soup.pre is not None:  # remove all code blocks
        soup.pre.decompose()
    edi = []
    on = False  # True while iterating tags inside this problem's section
    contest_url = '/'.join(prob_url.split('/')[:-1])
    for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'center', 'b']):
        links = tag.find_all(href=True)
        if len(links) > 0 and prob_url in links[0].prettify():
            # A link to our problem starts the section of interest.
            on = True
        elif len(links) > 0 and contest_url in links[0].prettify():
            # A link to another problem of the same contest ends it.
            on = False
        elif on:
            latex_content = ""
            for elem in tag.descendants:  # in case LaTeX doesn't render automatically with bs4
                # Hoisted: find_parent() was recomputed up to five times per node.
                parent_name = elem.find_parent().name
                if parent_name not in ('p', 'a', 'b', 'center', 'li'):
                    continue
                bold = parent_name == 'b'
                if isinstance(elem, str):
                    if 'code' in elem.lower() and bold:
                        continue  # skip bolded "code"/"Code" labels
                    latex_content += '**' * bold + elem + '**' * bold
                elif elem.name == "script" and elem.get("type") == "math/tex":
                    # Un-rendered MathJax source; wrap in Codeforces-style $$$.
                    latex_content += "$$$" + elem.string + "$$$"
            edi.append(latex_content)
    edi = '\n'.join(edi)
    # Cache the editorial text and record the pid as scraped.
    with open(SAVE_PATH + 'Editorials/' + pid, 'w') as f:
        f.write(edi)
    scraped_editorials.append(pid)
    return edi
| # print(problem('https://codeforces.com/problemset/problem/1985/G')) | |