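"""Codeforces scraper: fetches problem statements and editorials with headless
Chrome, parses them with BeautifulSoup, and caches the results under
prescraped/codeforces/Problems/ and prescraped/codeforces/Editorials/."""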
import os

import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

dir_path = os.path.dirname(os.path.realpath(__file__))
SAVE_PATH = dir_path + '/prescraped/codeforces/'

# Problem/editorial ids that already have cached files on disk.
scraped_problems = os.listdir(SAVE_PATH + "Problems")
scraped_editorials = os.listdir(SAVE_PATH + "Editorials")

def anti_scrape(soup):
    """If served the "Just a moment..." anti-bot page, recover the RCPC cookie by
    AES-CBC-decrypting the values embedded in the inline script, then re-fetch the
    real page. Returns the (possibly replaced) soup."""
    if soup.text == "Just a moment...Enable JavaScript and cookies to continue":
        print("Bypassing anti-scrape protection...")
        scr = soup.findAll("script")[-1].string
        print(scr)
        scr = scr[scr.index("var a=toNumbers"):].split(';')
        line = scr[0]
        abc = []
        # Collect the three hex arguments passed to toNumbers(): key, IV, ciphertext.
        while "toNumbers" in line:
            i = line.index("toNumbers")
            line = line[i + 11:]
            abc.append(line[:line.index('"')])
        from Crypto.Cipher import AES

        def to_numbers(x):
            return bytes(int(x[i:i + 2], 16) for i in range(0, len(x), 2))

        key, iv, cipher = map(to_numbers, abc)
        aes = AES.new(key, AES.MODE_CBC, iv)
        rcpc = aes.decrypt(cipher).hex()
        print(f"RCPC = {rcpc}")
        # The script ends with a redirect URL; re-request it with the RCPC cookie set.
        url = scr[-2]
        url = url[url.index('"') + 1:-1]
        r = requests.get(url, cookies={"RCPC": rcpc})
        soup = BeautifulSoup(r.text, "html.parser")
    return soup
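
# Note: the Selenium-based fetches below execute the page's JavaScript themselves,
# so the anti_scrape(soup) calls are currently left commented out; the helper is
# kept for plain requests-based fetching.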

def read(file_path):
    """Return the contents of a cached file."""
    with open(file_path, 'r') as f:
        return f.read()

def from_url(url):
    """Derive a problem id from its URL, e.g. .../contest/1985/problem/G -> '1985G'."""
    pid = url.split('/')[-3:]
    pid.remove('problem')
    return ''.join(pid)

def problem(url):
    pid = from_url(url)
    # Serve from the on-disk cache when possible.
    if pid in scraped_problems:
        statement = read(SAVE_PATH + "Problems/" + pid)
        if len(statement):
            return {"statement": statement}
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run the browser in the background
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        response = browser.page_source
    # response = requests.get(url)
    soup = BeautifulSoup(response, 'html.parser')
    # anti_scrape(soup)
    problem_statements = soup.find_all(class_='problem-statement')
    for statement in problem_statements:
        # title
        title = statement.find(class_='title').text.strip()
        # time limit
        time_limit = statement.find(class_='time-limit').text.strip().replace('time limit per test', '')
        # memory limit
        memory_limit = statement.find(class_='memory-limit').text.strip().replace('memory limit per test', '')
        # Fails for interactive (and presumably output-only) problems:
        # # input format
        # input_spec = statement.find(class_='input-specification').text.strip().replace('Input', '')
        # # output format
        # output_spec = statement.find(class_='output-specification').text.strip().replace('Output', '')
        # Statement body: every top-level div before the input specification, except the header.
        problem = ""
        for child in statement.children:
            if child.name == 'div' and 'input-specification' in child.get('class', []):
                break
            if child.name == 'div' and 'header' not in child.get('class', []):
                problem += child.text.strip()
        # sample inputs
        examples = statement.find(class_='sample-tests')
        input_tests = examples.find_all(class_='test-example-line')
        inputs = ""
        for i in input_tests:
            inputs += i.text.strip()
            inputs += '\n'
        # sample outputs
        output_tests = examples.find_all(class_='output')
        outputs = ""
        for i in output_tests:
            outputs += i.text.strip().replace('Output\n', '')
            outputs += '\n'
        # note (may be absent)
        note = statement.find(class_='note')
        notes = ""
        if note:
            notes += note.text.strip().replace('Note', '')
        data = {
            "title": title,
            "time_limit": time_limit,
            "memory_limit": memory_limit,
            # "input_format": input_spec,
            # "output_format": output_spec,
            "statement": problem,
            "sample_input": inputs,
            "sample_outputs": outputs,
            "note": notes,
        }
        # Cache the statement so later calls can skip Selenium.
        with open(SAVE_PATH + 'Problems/' + pid, 'w') as f:
            f.write(data["statement"])
        scraped_problems.append(pid)
        return data
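
# Example: problem('https://codeforces.com/contest/1985/problem/G') fetches the page
# with headless Chrome, returns the parsed fields, and caches the statement under
# Problems/1985G.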

def editorial(prob_url, edi_url, bot=None, query_func=None):
    pid = from_url(prob_url)
    # Serve from the on-disk cache when possible.
    if pid in scraped_editorials:
        edi = read(SAVE_PATH + "Editorials/" + pid)
        if len(edi):
            return edi
    # Normalise the problem URL to the contest/<id>/problem/<index> form that
    # editorial pages link to.
    domain = 'https://codeforces.com/'
    if domain in prob_url:
        prob_url = prob_url[len(domain):]
    if 'problemset/problem/' in prob_url:
        prob_url = prob_url[len('problemset/problem/'):]
        prob_url = 'contest/' + '/problem/'.join(prob_url.split('/'))
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run the browser in the background
    with Chrome(options=chrome_options) as browser:
        browser.get(edi_url)
        response = browser.page_source
    soup = BeautifulSoup(response, 'html.parser')
    # anti_scrape(soup)
    soup = soup.find_all(class_='content')[0]
    while soup.pre is not None:  # strip all code blocks from the editorial
        soup.pre.decompose()
    edi = []
    on = False  # tracks whether we are inside this problem's section
    contest_url = '/'.join(prob_url.split('/')[:-1])
    for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'center', 'b']):
        links = tag.find_all(href=True)
        if len(links) > 0 and prob_url in links[0].prettify():
            # A link to this exact problem starts its editorial section.
            on = True
        elif len(links) > 0 and contest_url in links[0].prettify():
            # A link to another problem of the same contest ends the section.
            on = False
        elif on:
            latex_content = ""
            for elem in tag.descendants:  # in case LaTeX doesn't render automatically with bs4
                parent = elem.find_parent().name
                if parent not in ('p', 'a', 'b', 'center', 'li'):
                    continue
                bold = parent == 'b'
                if isinstance(elem, str):
                    if 'code' in elem.lower() and bold:
                        continue  # skip bold "code" headings
                    latex_content += '**' * bold + elem + '**' * bold
                elif elem.name == "script" and elem.get("type") == "math/tex":
                    latex_content += "$$$" + elem.string + "$$$"
            edi.append(latex_content)
    edi = '\n'.join(edi)
    # print('bot', bot)
    # if bot:
    #     edi = bot.chat(query_func(problem(prob_url), edi))
    # Cache the editorial text so later calls can skip Selenium.
    with open(SAVE_PATH + 'Editorials/' + pid, 'w') as f:
        f.write(edi)
    scraped_editorials.append(pid)
    return edi
# print(problem('https://codeforces.com/problemset/problem/1985/G'))
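
# Minimal manual check (a sketch; assumes Chrome and a matching chromedriver are
# installed so Selenium can launch a headless browser, and that the cache
# directories under prescraped/codeforces/ exist):
if __name__ == "__main__":
    print(problem('https://codeforces.com/problemset/problem/1985/G'))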