Spaces:
Sleeping
Sleeping
from typing import Any | |
import requests | |
from markdownify import markdownify | |
from pydantic import BaseModel, Field | |
# Argument schema for the Wikipedia OpenSearch lookup: a single required
# free-text query string.
class WikipediaOpensearchInput(BaseModel):
    # `...` marks the field as required (no default value).
    query: str = Field(
        ...,
        description="The search term or keyword to look up on English Wikipedia.",
    )
# Argument schema for fetching a page's excerpt and section list: a single
# required page title.
class GetPageTitleExcerptSectionsInput(BaseModel):
    # `...` marks the field as required (no default value).
    page_title: str = Field(
        ...,
        description="The exact title of the Wikipedia page.",
    )
# Argument schema for fetching one section of a Wikipedia article: the page
# title plus the index of the wanted section. Both fields are required.
class GetPageSectionContentInput(BaseModel):
    page_title: str = Field(description="The exact Wikipedia article title.")
    # The help text must be passed as `description=`: the first positional
    # argument of `Field` is the field's *default value*, so passing the text
    # positionally made the description sentence the default section index.
    section_index: str = Field(
        description="The index of the section (from section metadata)."
    )
class SearchWikipediaEn:
    """Helpers that query the English Wikipedia MediaWiki Action API.

    None of the functions use instance state, so they are declared as static
    methods and can be called either on the class or on an instance.
    """

    # Endpoint shared by every request issued from this class.
    _API_URL = "https://en.wikipedia.org/w/api.php"
    # Upper bound (seconds) on each HTTP call so a stalled connection cannot
    # block the caller forever.
    _TIMEOUT_SECONDS = 30

    @staticmethod
    def _api_get(params: dict[str, Any]) -> Any:
        """Send one GET request to the API and return the decoded JSON body.

        Args:
            params: Query-string parameters for the request.

        Returns:
            The JSON-decoded response body.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within the timeout.
        """
        response = requests.get(
            url=SearchWikipediaEn._API_URL,
            params=params,
            timeout=SearchWikipediaEn._TIMEOUT_SECONDS,
        )
        # Fail with the HTTP status instead of a confusing JSON/KeyError later.
        response.raise_for_status()
        return response.json()

    @staticmethod
    def wikipedia_opensearch(query: str) -> list[Any]:
        """Search English Wikipedia articles through the OpenSearch action.

        Args:
            query: The search term or keyword to look up on English Wikipedia.

        Returns:
            The decoded JSON response of the `opensearch` action (at most 10
            results, main namespace only). Expected layout:

                [
                    'Python programming',
                    ['Python (programming language)', ...],
                    ['...', ...],
                    ['https://en.wikipedia.org/wiki/Python_(programming_language)', ...]
                ]

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within the timeout.
        """
        params = {
            "action": "opensearch",
            "namespace": "0",
            "search": query,
            "limit": 10,
            "format": "json",
        }
        return SearchWikipediaEn._api_get(params)

    @staticmethod
    def get_page_title_excerpt_sections(page_title: str) -> dict[str, Any]:
        """Return the plain-text extract and the section list of an article.

        Two requests are made: a `query`/`extracts` request for the plain-text
        extract, then a `parse`/`sections` request using the title reported
        by the first response.

        Args:
            page_title: The exact title of the Wikipedia page.

        Returns:
            A dict with two keys:
                'excerpt': plain-text extract of the page (str),
                'sections': list of dicts describing section metadata, e.g.
                    {"toclevel": 1, "level": "2", "line": "History", "index": "1", ...}

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within the timeout.
            KeyError, IndexError: If a response lacks the expected fields
                (presumably when the page does not exist - confirm against
                the API's error payload).
        """
        extract_params = {
            "action": "query",
            "prop": "extracts",
            "titles": page_title,
            "explaintext": True,
            "format": "json",
            "formatversion": 2,
        }
        # Decode the response once and reuse the page record for both fields.
        page = SearchWikipediaEn._api_get(extract_params)["query"]["pages"][0]
        resolved_title = page["title"]
        page_excerpt = page["extract"]

        sections_params = {
            "action": "parse",
            "prop": "sections",
            "page": resolved_title,
            "format": "json",
        }
        page_sections = SearchWikipediaEn._api_get(sections_params)["parse"]["sections"]

        return {
            "excerpt": page_excerpt,
            "sections": page_sections,
        }

    @staticmethod
    def get_page_section_content(page_title: str, section_index: str) -> str:
        """Fetch one section of an article and convert its HTML to Markdown.

        Args:
            page_title: The exact Wikipedia article title.
            section_index: The index of the section (from section metadata).
                When it is None or an empty string no `section` parameter is
                sent, so the API is asked for the whole page.

        Returns:
            Markdown produced by `markdownify` from the HTML the API returned.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within the timeout.
            KeyError: If the response lacks the expected `parse.text` fields
                (presumably for an unknown page or section - confirm against
                the API's error payload).
        """
        params = {
            "action": "parse",
            "page": page_title,
            "format": "json",
            "prop": "text",
        }
        # Test for "not provided" explicitly: a plain truthiness check would
        # drop an integer index 0 and silently return the whole page.
        if section_index is not None and str(section_index) != "":
            params["section"] = str(section_index)
        payload = SearchWikipediaEn._api_get(params)
        return markdownify(payload["parse"]["text"]["*"])