File size: 4,249 Bytes
7672dcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from typing import Any

import requests
from markdownify import markdownify
from pydantic import BaseModel, Field


class WikipediaOpensearchInput(BaseModel):
    # Single required field: the free-text search term.
    # (A `#` comment is used instead of a class docstring so the generated
    # model schema stays exactly as it was.)
    query: str = Field(
        ...,
        description="The search term or keyword to look up on English Wikipedia.",
    )


class GetPageTitleExcerptSectionsInput(BaseModel):
    # Single required field: the article title to look up.
    # (A `#` comment is used instead of a class docstring so the generated
    # model schema stays exactly as it was.)
    page_title: str = Field(
        ...,
        description="The exact title of the Wikipedia page.",
    )


class GetPageSectionContentInput(BaseModel):
    # Two required fields: the article title and the index of the section
    # to fetch.
    page_title: str = Field(description="The exact Wikipedia article title.")
    # The help text must be passed as `description=`: the first positional
    # argument of `Field` is the *default value*, which would silently make
    # this field optional with the help text as its value.
    section_index: str = Field(
        description="The index of the section (from section metadata)."
    )


class SearchWikipediaEn:
    """Static helpers that query the English Wikipedia MediaWiki Action API.

    Every public method issues one or two HTTP GET requests against
    ``https://en.wikipedia.org/w/api.php`` and returns the decoded result.
    """

    # Single endpoint shared by all requests in this class.
    _API_URL = "https://en.wikipedia.org/w/api.php"
    # Upper bound (seconds) for each HTTP request; without a timeout
    # `requests` may block indefinitely on a stalled connection.
    _TIMEOUT_SECONDS = 10

    @staticmethod
    def _api_get(params: dict[str, Any]) -> Any:
        """Send one GET request to the API and return the decoded JSON body.

        Args:
            params (dict): Query-string parameters for the API call.

        Returns:
            The decoded JSON payload (a list or dict depending on the action).

        Raises:
            requests.RequestException: On connection errors, timeouts,
                non-2xx HTTP status codes, or a body that is not valid JSON.
        """
        response = requests.get(
            SearchWikipediaEn._API_URL,
            params=params,
            timeout=SearchWikipediaEn._TIMEOUT_SECONDS,
        )
        # Fail with the real HTTP error instead of a confusing JSON decode
        # error on an HTML error page.
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _api_parse(params: dict[str, Any]) -> dict[str, Any]:
        """Run an ``action=parse`` request and return its ``parse`` object.

        Args:
            params (dict): Query-string parameters; must describe a parse call.

        Returns:
            dict: The value stored under the ``parse`` key of the response.

        Raises:
            KeyError: If the response has no ``parse`` key (for example the
                page or section does not exist). The message carries the
                API's own error text when one is present.
        """
        payload = SearchWikipediaEn._api_get(params)
        if not isinstance(payload, dict) or "parse" not in payload:
            info = "unexpected response"
            if isinstance(payload, dict):
                error = payload.get("error")
                if isinstance(error, dict):
                    info = error.get("info", info)
            raise KeyError(
                f"Wikipedia parse request failed for page "
                f"{params.get('page')!r}: {info}"
            )
        return payload["parse"]

    @staticmethod
    def wikipedia_opensearch(query: str) -> list[Any]:
        """
        Searches for Wikipedia articles matching the given query using the OpenSearch API.

        Args:
            query (str): The search term or keyword to look up on English Wikipedia.

        Returns:
            list: JSON-list with items including matched titles, descriptions, and URLs.
                Example:
                [
                    'Python programming',
                    ['Python (programming language)', ... ],
                    ['...', ...],
                    ['https://en.wikipedia.org/wiki/Python_(programming_language)', ...]
                ]

        Raises:
            requests.RequestException: If the HTTP request fails or times out.
        """
        return SearchWikipediaEn._api_get(
            {
                "action": "opensearch",
                "namespace": "0",
                "search": query,
                "limit": 10,
                "format": "json",
            }
        )

    @staticmethod
    def get_page_title_excerpt_sections(page_title: str) -> dict[str, Any]:
        """
        Retrieves the plain-text extract and section structure of a Wikipedia article.

        Two requests are made: an ``action=query`` call for the extract, then
        an ``action=parse`` call for the section list, using the title as
        reported back by the first call.

        Args:
            page_title (str): The exact title of the Wikipedia page.

        Returns:
            dict: {
                'excerpt': (str) Plain text extract of the page,
                'sections': (list) List of dictionaries describing section metadata.
            }
            Example:
            {
                "excerpt": "Python is a high-level programming language...",
                "sections": [
                    {"toclevel": 1, "level": "2", "line": "History", "index": "1", ...},
                    ...
                ]
            }

        Raises:
            KeyError: If the page does not exist or no extract/section data
                is returned for it.
            requests.RequestException: If an HTTP request fails or times out.
        """
        # NOTE(review): `exintro` is not set, so the extract may cover more
        # than the lead section -- confirm whether that is intended.
        query_payload = SearchWikipediaEn._api_get(
            {
                "action": "query",
                "prop": "extracts",
                "titles": page_title,
                "explaintext": True,
                "format": "json",
                "formatversion": 2,
            }
        )

        pages = []
        if isinstance(query_payload, dict):
            pages = (query_payload.get("query") or {}).get("pages") or []
        # A missing/invalid title yields a page entry without an "extract".
        if not pages or "extract" not in pages[0]:
            raise KeyError(
                f"No extract available for Wikipedia page {page_title!r}"
            )

        page = pages[0]
        # Use the title as returned by the API for the follow-up request.
        canonical_title = page.get("title", page_title)

        parsed = SearchWikipediaEn._api_parse(
            {
                "action": "parse",
                "prop": "sections",
                "page": canonical_title,
                "format": "json",
            }
        )

        return {
            "excerpt": page["extract"],
            "sections": parsed["sections"],
        }

    @staticmethod
    def get_page_section_content(page_title: str, section_index: str) -> str:
        """
        Fetches the Markdown-formatted content of a specific section from a Wikipedia article.

        Args:
            page_title (str): The exact Wikipedia article title.
            section_index (str): The index of the section (from section metadata).
                When empty or None, no section filter is sent and the whole
                page is converted.

        Returns:
            str: Markdown-formatted content of the specified section.
            Example:
                '# History\\nPython was conceived in the late 1980s...'

        Raises:
            KeyError: If the page or section cannot be parsed.
            requests.RequestException: If the HTTP request fails or times out.
        """
        params = {
            "action": "parse",
            "page": page_title,
            "format": "json",
            "prop": "text",
        }
        # Normalise to text before the emptiness check so that an integer 0
        # (the lead section) is not mistaken for "no section given".
        section = "" if section_index is None else str(section_index).strip()
        if section:
            params["section"] = section

        parsed = SearchWikipediaEn._api_parse(params)

        # Without `formatversion=2` the rendered HTML is nested under "*".
        return markdownify(parsed["text"]["*"])