# Some functions are adapted from https://github.com/yuxiaw/Factcheck-GPT
import concurrent.futures
import requests
import bs4
import re
from typing import List, Tuple
import itertools
import numpy as np
from time import time
def is_tag_visible(element: bs4.element.PageElement) -> bool:
    """Determines if an HTML element is visible.

    Args:
        element: A BeautifulSoup element to check the visibility of.
    Returns:
        Whether the element is visible.
    """
    if element.parent.name in [
        "style",
        "script",
        "head",
        "title",
        "meta",
        "[document]",
    ] or isinstance(element, bs4.element.Comment):
        return False
    return True
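
# Illustrative sketch (not part of the original module): filter a page's text
# nodes down to the visible ones.
#
#     soup = bs4.BeautifulSoup(
#         "<html><head><title>t</title></head><body><p>hello</p></body></html>",
#         "html.parser",
#     )
#     visible = [t for t in soup.findAll(string=True) if is_tag_visible(t)]
#     # visible -> ["hello"]; the <title> text is dropped.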

def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
    """Scrapes a URL for all visible text.

    Args:
        url: URL of the webpage to scrape.
        timeout: Timeout of the requests call.
    Returns:
        web_text: The visible text of the scraped URL, or None on failure.
        url: The input URL.
    """
    # Fetch the page.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None, url

    # Extract all text from the tags.
    try:
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        texts = soup.findAll(string=True)
        # Filter out invisible text from the page.
        visible_text = filter(is_tag_visible, texts)
    except Exception:
        return None, url

    # Concatenate all visible text into a single string and clean up spacing.
    web_text = " ".join(t.strip() for t in visible_text).strip()
    web_text = " ".join(web_text.split())
    return web_text, url
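
# Illustrative usage sketch (not part of the original module): scrape a single
# page and handle failures; "https://example.com" is only a placeholder URL.
#
#     text, url = scrape_url("https://example.com", timeout=3)
#     if text is None:
#         print(f"Failed to scrape {url}")
#     else:
#         print(text[:200])
#
# Because scrape_url returns (None, url) on failure, callers can still match
# results back to their source URLs when some requests time out.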

def search_google(query: str, num_web_pages: int = 10, timeout: int = 6, save_url: str = '') -> List[str]:
    """Searches the query using Google.

    Args:
        query: Search query.
        num_web_pages: The number of web pages to request.
        timeout: Timeout of each requests call.
        save_url: Path to save the returned URLs, e.g. 'urls.txt'. An empty string disables saving.
    Returns:
        search_results: A list of the top URLs relevant to the query.
    """
    # Set a desktop user-agent: Google returns different pages depending on the agent device.
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    headers = {'User-Agent': USER_AGENT}

    # Language settings:
    #   &hl=XX sets the Google interface language.
    #   &lr=lang_XX sets the preferred language of the search results.
    # Use English; otherwise Google returns many pages translated (e.g. to Arabic)
    # that cannot be opened correctly.
    lang = "en"

    # Scrape Google results page by page: "start" paginates in steps of 10,
    # e.g. result page 2 corresponds to start=10.
    urls = []
    for page in range(0, num_web_pages, 10):
        # url = "https://www.google.com/search?q={}&start={}".format(query, page)
        url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
        r = requests.get(url, headers=headers, timeout=timeout)
        # Collect all URLs from the result page with a regular expression.
        # TODO: keep only the top-k returned pages.
        urls += re.findall('href="(https?://.*?)"', r.text)

    # Deduplicate the URLs.
    urls = list(set(urls))

    # Optionally save all URLs to a text file.
    if save_url != "":
        with open(save_url, 'w') as file:
            for url in urls:
                file.write(url + '\n')
    return urls
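
# Illustrative usage sketch (not part of the original module): fetch candidate
# URLs for a query and scrape them concurrently via the already-imported
# concurrent.futures; the query string below is a placeholder.
#
#     urls = search_google("Marie Curie Nobel Prize years", num_web_pages=10)
#     with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
#         results = list(pool.map(scrape_url, urls))
#     scraped = [(text, url) for text, url in results if text is not None]
#
# Scraping Google result pages directly can be rate-limited or blocked, so a
# dedicated search API is usually more robust in production.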

def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
    """Orders the documents, scores, and URLs by score in descending order.

    allow_duplicated_urls:
        - If False, only the highest-scored chunk per URL is returned, with its score and URL.
        - If True, all chunks are returned, with their scores and URLs.
    """
    # Flatten the used_chunk and support_prob_per_chunk lists.
    flattened_docs = [doc for chunk in used_chunk for doc in chunk]
    flattened_scores = [score for chunk in support_prob_per_chunk for score in chunk]

    # Create a list of (doc, score, url) tuples, repeating each URL once per chunk of its document.
    doc_score_url = list(zip(flattened_docs, flattened_scores, np.repeat(urls, [len(chunk) for chunk in used_chunk])))

    # Sort the list by score in descending order.
    ranked_doc_score_url = sorted(doc_score_url, key=lambda x: x[1], reverse=True)

    # Unzip the sorted list to get the ranked docs, scores, and URLs.
    ranked_docs, scores, ranked_urls = zip(*ranked_doc_score_url)
    if allow_duplicated_urls:
        return ranked_docs, scores, ranked_urls
    else:
        # Keep only the first (i.e. highest-scored) chunk for each URL.
        filtered_docs = []
        filtered_scores = []
        filtered_urls = []
        seen_urls = set()
        for doc, score, url in zip(ranked_docs, scores, ranked_urls):
            if url not in seen_urls:
                filtered_docs.append(doc)
                filtered_scores.append(score)
                filtered_urls.append(url)
                seen_urls.add(url)

        # Update the variables with the filtered results.
        ranked_docs = filtered_docs
        scores = filtered_scores
        ranked_urls = filtered_urls
        return ranked_docs, scores, ranked_urls
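
# Illustrative usage sketch (not part of the original module); all values below
# are made up to show the expected shapes and ordering.
#
#     used_chunk = [["chunk a1", "chunk a2"], ["chunk b1"]]
#     support_prob_per_chunk = [[0.2, 0.9], [0.7]]
#     urls = ["https://a.example", "https://b.example"]
#     docs, scores, ranked_urls = order_doc_score_url(
#         used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False
#     )
#     # docs        -> ["chunk a2", "chunk b1"]
#     # scores      -> [0.9, 0.7]
#     # ranked_urls -> ["https://a.example", "https://b.example"]
#
# With allow_duplicated_urls=True, "chunk a1" (score 0.2) would also be kept,
# ranked last, and the outputs would be the tuples produced by zip(*...).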