# Some functions are adapted from https://github.com/yuxiaw/Factcheck-GPT
import concurrent.futures
import requests
import bs4
import re
from typing import List, Optional, Tuple
import itertools
import numpy as np
from time import time


def is_tag_visible(element: bs4.element.PageElement) -> bool:
    """Determines whether an HTML element is visible.

    Args:
        element: A BeautifulSoup element to check the visibility of.
    Returns:
        Whether the element is visible.
    """
    if element.parent.name in [
        "style",
        "script",
        "head",
        "title",
        "meta",
        "[document]",
    ] or isinstance(element, bs4.element.Comment):
        return False
    return True


def scrape_url(url: str, timeout: float = 3) -> Tuple[Optional[str], str]:
    """Scrapes a URL for all visible text.

    Args:
        url: URL of the webpage to scrape.
        timeout: Timeout of the requests call, in seconds.
    Returns:
        web_text: The visible text of the scraped URL, or None on failure.
        url: The input URL.
    """
    # Fetch the URL.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None, url

    # Extract the text from all tags.
    try:
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        texts = soup.find_all(string=True)
        # Filter out invisible text from the page.
        visible_text = filter(is_tag_visible, texts)
    except Exception:
        return None, url

    # Concatenate all the text into a single string and normalize whitespace.
    web_text = " ".join(t.strip() for t in visible_text).strip()
    web_text = " ".join(web_text.split())
    return web_text, url


def search_google(query: str, num_web_pages: int = 10, timeout: int = 6, save_url: str = '') -> List[str]:
    """Searches the query using Google.

    Args:
        query: Search query.
        num_web_pages: The number of web pages to request.
        timeout: Timeout of each requests call, in seconds.
        save_url: Path for saving the returned URLs, e.g. 'urls.txt'.
            An empty string disables saving.
    Returns:
        search_results: A list of the top URLs relevant to the query.
    """
    # Set headers: Google returns different pages depending on the agent device.
    # Desktop user agent:
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    headers = {'User-Agent': USER_AGENT}

    # Language settings:
    #   &hl=XX sets the Google interface language.
    #   &lr=lang_XX sets the preferred language of the search results.
    # Set the language to English; otherwise Google returns many translated
    # pages (e.g. to Arabic) that cannot be opened correctly.
    lang = "en"

    # Scrape the Google result pages.
    urls = []
    for page in range(0, num_web_pages, 10):
        # Google paginates in steps of 10: result page 2 corresponds to start=10.
        url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
        r = requests.get(url, headers=headers, timeout=timeout)
        # Collect all URLs with a regular expression. Note that this picks up
        # every href on the results page, not only the top-k organic results.
        urls += re.findall('href="(https?://.*?)"', r.text)

    # Deduplicate the URLs.
    urls = list(set(urls))

    # Save all URLs to a text file, if requested.
    if save_url:
        with open(save_url, 'w') as file:
            for url in urls:
                file.write(url + '\n')
    return urls
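

# Illustrative sketch, not part of the adapted Factcheck-GPT code: scrape_url
# is network-bound, and the concurrent.futures import above suggests fetching
# several URLs in parallel. The helper name scrape_urls_parallel and the
# max_workers default are assumptions made for this example.
def scrape_urls_parallel(urls: List[str], max_workers: int = 8) -> List[Tuple[Optional[str], str]]:
    """Scrapes multiple URLs concurrently, returning (web_text, url) pairs.

    Failed fetches come back as (None, url), mirroring scrape_url.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map preserves the input order of the URLs.
        return list(executor.map(scrape_url, urls))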
""" # Flatten the used_chunk and support_prob_per_chunk lists flattened_docs = [doc for chunk in used_chunk for doc in chunk] flattened_scores = [score for chunk in support_prob_per_chunk for score in chunk] # Create a list of tuples containing the doc, score, and corresponding URL doc_score_url = list(zip(flattened_docs, flattened_scores, np.repeat(urls, [len(chunk) for chunk in used_chunk]))) # Sort the list based on the scores in descending order ranked_doc_score_url = sorted(doc_score_url, key=lambda x: x[1], reverse=True) # Unzip the sorted list to get the ranked docs, scores, and URLs ranked_docs, scores, ranked_urls = zip(*ranked_doc_score_url) if allow_duplicated_urls: return ranked_docs, scores, ranked_urls else: filtered_docs = [] filtered_scores = [] filtered_urls = [] seen_urls = set() for doc, score, url in zip(ranked_docs, scores, ranked_urls): if url not in seen_urls: filtered_docs.append(doc) filtered_scores.append(score) filtered_urls.append(url) seen_urls.add(url) # Update the variables with the filtered results ranked_docs = filtered_docs scores = filtered_scores ranked_urls = filtered_urls return ranked_docs, scores, ranked_urls