# Some functions are adapted from https://github.com/yuxiaw/Factcheck-GPT

import concurrent.futures
import requests
import bs4
import re
from typing import List, Optional, Tuple
import itertools
import numpy as np
from time import time


def is_tag_visible(element: bs4.element.PageElement) -> bool:
    """Determines whether an HTML element is visible.

    Args:
        element: A BeautifulSoup element to check the visibility of.
    Returns:
        Whether the element is visible.
    """
    if element.parent.name in [
        "style",
        "script",
        "head",
        "title",
        "meta",
        "[document]",
    ] or isinstance(element, bs4.element.Comment):
        return False
    return True


def scrape_url(url: str, timeout: float = 3) -> Tuple[Optional[str], str]:
    """Scrapes a URL for all visible text.

    Args:
        url: URL of the webpage to scrape.
        timeout: Timeout of the requests call, in seconds.
    Returns:
        web_text: The visible text of the scraped URL, or None if the scrape failed.
        url: The input URL.
    """
    # Scrape the URL
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None, url

    # Extract all text from the tags.
    try:
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        texts = soup.find_all(string=True)
        # Filter out invisible text from the page.
        visible_text = filter(is_tag_visible, texts)
    except Exception:
        return None, url

    # Returns all the text concatenated as a string.
    web_text = " ".join(t.strip() for t in visible_text).strip()
    # Clean up spacing.
    web_text = " ".join(web_text.split())
    return web_text, url
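

# Illustrative sketch (not part of the original pipeline): scrape_url is I/O bound,
# so several URLs can be fetched concurrently with the already-imported
# concurrent.futures module. The function name and worker count below are assumptions.
def scrape_urls_parallel(urls: List[str], max_workers: int = 8) -> List[Tuple[Optional[str], str]]:
    """Scrapes multiple URLs concurrently; returns one (web_text, url) pair per URL."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(scrape_url, urls))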


def search_google(query: str, num_web_pages: int = 10, timeout: int = 6, save_url: str = "") -> List[str]:
    """Searches for the query using Google.

    Args:
        query: Search query.
        num_web_pages: The number of search results to request (in multiples of 10).
        timeout: Timeout of each requests call, in seconds.
        save_url: Path to a file in which to save the returned URLs, e.g. 'urls.txt'.
    Returns:
        A list of URLs extracted from the Google result pages.
    """
    # Set a desktop user agent: Google returns different pages depending on the client device.
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    headers = {"User-Agent": USER_AGENT}

    # Set the language:
    #   &hl=XX sets the Google interface language.
    #   &lr=lang_XX sets the preferred language of the search results.
    # Fix the language to English; otherwise Google may return translated pages
    # (e.g. to Arabic) that cannot be opened correctly.
    lang = "en"

    # Scrape Google result pages.
    urls = []
    for page in range(0, num_web_pages, 10):
        # `page` is Google's pagination offset: result page 2 corresponds to start=10.
        # Pass the query through `params` so that requests URL-encodes it.
        params = {"q": query, "lr": f"lang_{lang}", "hl": lang, "start": page}
        r = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=timeout)
        # Collect all hyperlinks in the result page with a regular expression.
        # Note that this returns every link on the page, not only the top-k organic results.
        urls += re.findall('href="(https?://.*?)"', r.text)

    # Deduplicate URLs. Note that converting to a set does not preserve order.
    urls = list(set(urls))

    # Optionally save all URLs to a text file.
    if save_url:
        with open(save_url, "w") as file:
            for url in urls:
                file.write(url + "\n")
    return urls
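

# Usage sketch (an assumption, not from the original code): chain search_google and
# scrape_url to retrieve evidence pages for a query. The query string and the number
# of pages scraped here are placeholders.
def _search_and_scrape_example(query: str = "example query") -> List[Tuple[str, str]]:
    urls = search_google(query, num_web_pages=10)
    pages = [scrape_url(u) for u in urls[:5]]
    # Keep only pages whose text could actually be scraped.
    return [(text, u) for text, u in pages if text is not None]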


def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
    """Orders documents, scores, and URLs by score in descending order.

    Args:
        used_chunk: One list of text chunks per URL.
        support_prob_per_chunk: One list of support scores per URL, parallel to used_chunk.
        urls: The URLs the chunks were retrieved from.
        allow_duplicated_urls:
            - If False, return only the highest-scored chunk per URL (+ scores + URLs).
            - If True, return all chunks (+ scores + URLs); URLs may then repeat.
    """

    # Flatten the used_chunk and support_prob_per_chunk lists
    flattened_docs = [doc for chunk in used_chunk for doc in chunk]
    flattened_scores = [score for chunk in support_prob_per_chunk for score in chunk]

    # Create a list of tuples containing the doc, score, and corresponding URL
    doc_score_url = list(zip(flattened_docs, flattened_scores, np.repeat(urls, [len(chunk) for chunk in used_chunk])))

    # Sort the list based on the scores in descending order
    ranked_doc_score_url = sorted(doc_score_url, key=lambda x: x[1], reverse=True)

    # Unzip the sorted list to get the ranked docs, scores, and URLs
    ranked_docs, scores, ranked_urls = zip(*ranked_doc_score_url)

    if allow_duplicated_urls:
        return ranked_docs, scores, ranked_urls

    # Otherwise keep only the highest-scored chunk per URL.
    filtered_docs = []
    filtered_scores = []
    filtered_urls = []
    seen_urls = set()

    for doc, score, url in zip(ranked_docs, scores, ranked_urls):
        if url not in seen_urls:
            filtered_docs.append(doc)
            filtered_scores.append(score)
            filtered_urls.append(url)
            seen_urls.add(url)

    return filtered_docs, filtered_scores, filtered_urls
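

# Minimal worked example (made-up values, for illustration only) showing the expected
# input shapes: used_chunk[i] holds the chunks retrieved from urls[i], and
# support_prob_per_chunk[i] holds one support score per chunk.
if __name__ == "__main__":
    example_used_chunk = [["chunk A1", "chunk A2"], ["chunk B1"]]
    example_scores = [[0.2, 0.9], [0.7]]
    example_urls = ["https://example.com/a", "https://example.com/b"]

    docs, scores, ranked_urls = order_doc_score_url(
        example_used_chunk, example_scores, example_urls, allow_duplicated_urls=False
    )
    # With allow_duplicated_urls=False, only the best chunk per URL survives:
    # docs -> ['chunk A2', 'chunk B1'], scores -> [0.9, 0.7]
    print(docs, scores, ranked_urls)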