Spaces:
Sleeping
Sleeping
File size: 4,249 Bytes
7672dcd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
from typing import Any
import requests
from markdownify import markdownify
from pydantic import BaseModel, Field
class WikipediaOpensearchInput(BaseModel):
    # Input schema holding a single free-text search query. Presumably paired
    # with SearchWikipediaEn.wikipedia_opensearch (the field name and
    # description match that method's argument) -- confirm at the call site.

    # Required field: the leading ``...`` marks it as having no default.
    query: str = Field(
        ...,
        description="The search term or keyword to look up on English Wikipedia.",
    )
class GetPageTitleExcerptSectionsInput(BaseModel):
    # Input schema holding the title of one Wikipedia page. Presumably paired
    # with SearchWikipediaEn.get_page_title_excerpt_sections (the field name
    # and description match that method's argument) -- confirm at the call site.

    # Required field: the leading ``...`` marks it as having no default.
    page_title: str = Field(
        ...,
        description="The exact title of the Wikipedia page.",
    )
class GetPageSectionContentInput(BaseModel):
    # Input schema identifying one section of one Wikipedia page. Presumably
    # paired with SearchWikipediaEn.get_page_section_content (the field names
    # match that method's arguments) -- confirm at the call site.

    page_title: str = Field(description="The exact Wikipedia article title.")
    # The help text must be passed as ``description=``: the first positional
    # argument of ``Field`` is the *default value*, so passing the text
    # positionally made the field optional with the help text as its value
    # and left the schema without any description for this field.
    section_index: str = Field(
        description="The index of the section (from section metadata)."
    )
class SearchWikipediaEn:
    """Static helpers that query the English Wikipedia MediaWiki API.

    Every public method issues one or two HTTP GET requests against
    ``https://en.wikipedia.org/w/api.php`` and returns data extracted from
    the JSON response. The class holds no state; all methods are static.
    """

    # Single API endpoint shared by every request issued by this class.
    _API_URL = "https://en.wikipedia.org/w/api.php"
    # Upper bound (seconds) on each request; without a timeout ``requests``
    # may block forever on an unresponsive server.
    _TIMEOUT_SECONDS = 30

    @staticmethod
    def _api_get(params: dict[str, Any]) -> Any:
        """Send one GET request to the API and return the decoded JSON body.

        Args:
            params: Query-string parameters for the MediaWiki API call.

        Returns:
            The JSON-decoded response body.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: On connection problems or timeout.
        """
        response = requests.get(
            SearchWikipediaEn._API_URL,
            params=params,
            timeout=SearchWikipediaEn._TIMEOUT_SECONDS,
        )
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()

    @staticmethod
    def wikipedia_opensearch(query: str) -> list[Any]:
        """
        Searches for Wikipedia articles matching the given query using the OpenSearch API.

        Args:
            query (str): The search term or keyword to look up on English Wikipedia.

        Returns:
            list: The decoded JSON list returned by the API, with items
            including matched titles, descriptions, and URLs.

        Raises:
            requests.RequestException: If the HTTP request fails or times out.

        Example:
            [
                'Python programming',
                ['Python (programming language)', ... ],
                ['...', ...],
                ['https://en.wikipedia.org/wiki/Python_(programming_language)', ...]
            ]
        """
        params = {
            "action": "opensearch",
            "namespace": "0",
            "search": query,
            "limit": 10,
            "format": "json",
        }
        return SearchWikipediaEn._api_get(params)

    @staticmethod
    def get_page_title_excerpt_sections(page_title: str) -> dict[str, Any]:
        """
        Retrieves the plain-text extract and section structure of a Wikipedia article.

        Two requests are made: a ``query``/``extracts`` call for the text, then
        a ``parse``/``sections`` call that uses the title reported by the first
        response.

        Args:
            page_title (str): The exact title of the Wikipedia page.

        Returns:
            dict: {
                'excerpt': (str) Plain text extract of the page,
                'sections': (list) List of dictionaries describing section metadata.
            }

        Raises:
            requests.RequestException: If an HTTP request fails or times out.
            KeyError: If a response lacks the expected keys (presumably the
                case for a missing page or an API error payload -- verify).

        Example:
            {
                "excerpt": "Python is a high-level programming language...",
                "sections": [
                    {"toclevel": 1, "level": "2", "line": "History", "index": "1", ...},
                    ...
                ]
            }
        """
        # NOTE(review): no ``exintro``/length limit is sent, so the extract may
        # be longer than a summary -- confirm whether that is intended.
        extract_params = {
            "action": "query",
            "prop": "extracts",
            "titles": page_title,
            "explaintext": True,
            "format": "json",
            "formatversion": 2,
        }
        # Decode the body once and read both fields from the same page entry.
        page = SearchWikipediaEn._api_get(extract_params)["query"]["pages"][0]
        resolved_title = page["title"]
        page_excerpt = page["extract"]

        sections_params = {
            "action": "parse",
            "prop": "sections",
            # Use the title as reported by the API in the first response.
            "page": resolved_title,
            "format": "json",
        }
        page_sections = SearchWikipediaEn._api_get(sections_params)["parse"]["sections"]

        return {
            "excerpt": page_excerpt,
            "sections": page_sections,
        }

    @staticmethod
    def get_page_section_content(page_title: str, section_index: str) -> str:
        """
        Fetches the Markdown-formatted content of a specific section from a Wikipedia article.

        The rendered HTML returned by the ``parse`` action is converted to
        Markdown with ``markdownify``.

        Args:
            page_title (str): The exact Wikipedia article title.
            section_index (str): The index of the section (from section metadata).
                When it is ``None`` or an empty string no ``section`` parameter
                is sent and the whole page is requested instead.

        Returns:
            str: Markdown-formatted content of the specified section.

        Raises:
            requests.RequestException: If the HTTP request fails or times out.
            KeyError: If the response lacks the expected ``parse``/``text`` keys.

        Example:
            '# History\\nPython was conceived in the late 1980s...'
        """
        params = {
            "action": "parse",
            "page": page_title,
            "format": "json",
            "prop": "text",
        }
        # Compare explicitly rather than by truthiness so that an integer 0
        # still selects section 0 instead of silently fetching the whole page.
        if section_index is not None and section_index != "":
            params["section"] = str(section_index)

        payload = SearchWikipediaEn._api_get(params)
        return markdownify(payload["parse"]["text"]["*"])
|