File size: 2,495 Bytes
ef3de03
 
 
 
0f6452f
ef3de03
0f6452f
 
ef3de03
 
 
 
 
 
0f6452f
 
 
 
 
 
 
ef3de03
 
 
 
 
0f6452f
 
 
 
 
 
 
 
 
 
ef3de03
0f6452f
ef3de03
 
f150f6b
 
 
 
 
ef3de03
0f6452f
f150f6b
ef3de03
 
 
0f6452f
f150f6b
0f6452f
 
 
 
 
 
ef3de03
 
 
 
 
 
 
0f6452f
ef3de03
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from bs4 import BeautifulSoup
from pathlib import Path


class QueryResultsExtractor:
    """Extract the query, result entries and related questions from a saved HTML page.

    The selectors used (``textarea``, ``div.g``, ``div[data-sncf="1"]``,
    ``div.ITZIwc``, ``div.related-question-pair``) look like search-engine
    result-page markup — NOTE(review): confirm against the pages fed to this class.

    Attributes are rebuilt on every extraction, so one instance can be reused
    for several pages without results from an earlier page leaking into a
    later one.
    """

    def __init__(self) -> None:
        self.soup = None  # parsed document; populated by load_html()
        self.query = ""
        self.query_results = []
        self.related_questions = []
        self.search_results = {}

    def load_html(self, html_path) -> None:
        """Read ``html_path`` as UTF-8 text and parse it into ``self.soup``."""
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        self.soup = BeautifulSoup(html, "html.parser")

    @staticmethod
    def _stripped_text(element) -> str:
        """Return the element's text without surrounding whitespace, or "" if it is missing."""
        return "" if element is None else element.text.strip()

    def extract_query_results(self) -> None:
        """Collect the query string and one dict per ``div.g`` element.

        Sets ``self.query`` (text of the first ``textarea``, "" when absent) and
        replaces ``self.query_results`` with a list of dicts holding ``title``,
        ``site``, ``url``, ``abstract``, ``index`` and ``type``.

        An element without a link URL or without an ``h3`` heading is skipped;
        a missing site or abstract is stored as "". ``index`` is the entry's
        position among the kept entries. Each kept entry and the total number
        of ``div.g`` elements are printed.
        """
        self.query = self._stripped_text(self.soup.find("textarea"))
        query_result_elements = self.soup.find_all("div", class_="g")

        # Build into a fresh list so repeated calls do not accumulate entries.
        results = []
        for result in query_result_elements:
            link = result.find("a")
            heading = result.find("h3")
            url = link.get("href") if link is not None else None
            if not url or heading is None:
                # Without a target URL and a heading there is nothing usable here.
                continue
            title = self._stripped_text(heading)

            # The site name is read from the <span> that precedes the <cite>.
            cite = result.find("cite")
            site_element = cite.find_previous("span") if cite is not None else None
            site = self._stripped_text(site_element)

            # Two alternative containers may hold the abstract; try both.
            abstract_element = result.find("div", {"data-sncf": "1"})
            if abstract_element is None:
                abstract_element = result.find("div", class_="ITZIwc")
            abstract = self._stripped_text(abstract_element)

            print(f"{title}\n" f"  - {site}\n" f"  - {url}\n" f"  - {abstract}\n" f"\n")
            results.append(
                {
                    "title": title,
                    "site": site,
                    "url": url,
                    "abstract": abstract,
                    "index": len(results),
                    "type": "web",
                }
            )
        self.query_results = results
        print(len(query_result_elements))

    def extract_related_questions(self) -> None:
        """Collect the text of the first ``span`` in each ``div.related-question-pair``.

        Replaces ``self.related_questions``; elements without a ``span`` are
        skipped. Each question and the number of matched elements are printed.
        """
        related_question_elements = self.soup.find_all(
            "div", class_="related-question-pair"
        )
        # Fresh list for the same reason as in extract_query_results().
        questions = []
        for question_element in related_question_elements:
            span = question_element.find("span")
            if span is None:
                continue
            question = span.text.strip()
            print(question)
            questions.append(question)
        self.related_questions = questions
        print(len(related_question_elements))

    def extract(self, html_path) -> dict:
        """Load ``html_path`` and return its query, results and related questions.

        Returns:
            A dict with keys ``query``, ``query_results`` and
            ``related_questions``; the same dict is stored in
            ``self.search_results``.
        """
        self.load_html(html_path)
        self.extract_query_results()
        self.extract_related_questions()
        self.search_results = {
            "query": self.query,
            "query_results": self.query_results,
            "related_questions": self.related_questions,
        }
        return self.search_results


if __name__ == "__main__":
    # Manual smoke run: parse one saved page from the "files" directory that
    # sits one level above this script's own directory.
    # An alternative sample page name is "python教程".
    html_filename = "python_tutorials"
    sample_dir = Path(__file__).parents[1] / "files"
    sample_page = sample_dir / f"{html_filename}.html"
    QueryResultsExtractor().extract(sample_page)