Spaces:
Sleeping
Sleeping
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional

import wikipediaapi
@dataclass
class WikiSearchResult:
    """Data class to store Wikipedia article information.

    The ``@dataclass`` decorator generates the keyword-accepting
    ``__init__`` (plus ``__repr__`` and ``__eq__``); without it the class
    only carries annotations and cannot be constructed with field values.

    Attributes:
        title: Article title.
        summary: Article summary text.
        full_text: Full article text.
        url: URL of the article.
        last_modified: Timestamp recorded for the article.
        categories: Titles of the categories attached to the article.
    """

    title: str
    summary: str
    full_text: str
    url: str
    last_modified: datetime
    categories: List[str]
def initialize_wikipedia_client(language: str = 'en', user_agent: str = 'WikipediaSearcher/1.0') -> wikipediaapi.Wikipedia:
    """
    Build the Wikipedia API client used by the search helpers.

    Args:
        language: Language code of the Wikipedia edition (e.g., 'en' for English)
        user_agent: User agent string sent with API requests

    Returns:
        A wikipediaapi.Wikipedia instance configured to extract text in WIKI format
    """
    client_options = {
        "language": language,
        "extract_format": wikipediaapi.ExtractFormat.WIKI,
        "user_agent": user_agent,
    }
    return wikipediaapi.Wikipedia(**client_options)
def process_page(page: wikipediaapi.WikipediaPage) -> WikiSearchResult:
    """
    Convert a Wikipedia page object into a WikiSearchResult.

    Args:
        page: Page object to read the title, summary, text, URL, touched
            timestamp and categories from

    Returns:
        WikiSearchResult populated from the page's attributes
    """
    category_titles = []
    for category in page.categories.values():
        category_titles.append(category.title)

    # The dict literal keeps the same attribute-access order as a direct
    # keyword call would; `touched` is parsed as 'YYYY-MM-DDTHH:MM:SSZ'.
    fields = {
        "title": page.title,
        "summary": page.summary,
        "full_text": page.text,
        "url": page.fullurl,
        "last_modified": datetime.strptime(page.touched, '%Y-%m-%dT%H:%M:%SZ'),
        "categories": category_titles,
    }
    return WikiSearchResult(**fields)
def search_wikipedia(client: wikipediaapi.Wikipedia, query: str, results_limit: int = 3) -> List[WikiSearchResult]:
    """
    Look up the page titled `query` and, optionally, pages it links to.

    The first result is always the page matching `query` exactly. When
    `results_limit` is greater than 1, the first `results_limit - 1` link
    titles of that page are looked up as well and the ones that exist are
    appended.

    Args:
        client: Wikipedia API client instance
        query: Title to look up
        results_limit: Maximum number of results to return

    Returns:
        List of WikiSearchResult objects; empty when the page does not exist
        or when any exception is raised during the lookup (the error is logged)
    """
    try:
        main_page = client.page(query)
        if main_page.exists():
            found = [process_page(main_page)]
            if results_limit > 1:
                # Related pages come from the main page's outgoing links.
                link_titles = list(main_page.links.keys())
                extra_count = results_limit - 1
                for link_title in link_titles[:extra_count]:
                    linked_page = client.page(link_title)
                    if linked_page.exists():
                        found.append(process_page(linked_page))
            return found
        logging.warning(f"No exact match found for: {query}")
        return []
    except Exception as e:
        # Best-effort search: any failure is logged and reported as "no results".
        logging.error(f"Error searching Wikipedia: {e}")
        return []
def format_result(result: WikiSearchResult, include_full_text: bool = False) -> str:
    """
    Render a search result as human-readable text.

    At most the first five categories are listed; '...' is appended when
    the result has more than five.

    Args:
        result: WikiSearchResult object to format
        include_full_text: Whether to append the full article text

    Returns:
        Formatted string containing title, URL, last-modified timestamp,
        categories and summary, followed by the full text if requested
    """
    rendered = (
        f"\nTitle: {result.title}\n"
        f"URL: {result.url}\n"
        f"Last Modified: {result.last_modified}\n"
        f"Categories: {', '.join(result.categories[:5])}{'...' if len(result.categories) > 5 else ''}\n"
        f"Summary:\n"
        f"{result.summary}\n"
    )
    if not include_full_text:
        return rendered
    return rendered + f"\nFull Text:\n{result.full_text}"
def get_wiki_data(query: str, results_limit: int = 3) -> Optional[List[str]]:
    """
    Get Wikipedia data for a given query.

    The full query is tried first. If it yields nothing, progressively
    shorter word prefixes of the query are tried (from all words down to the
    first two) until one yields a result.

    Args:
        query: Search query string
        results_limit: Maximum number of results requested from each search;
            only the first result's summary is returned

    Returns:
        A one-element list holding the summary of the first matching article,
        or None if no query variation produced a result.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    client = initialize_wikipedia_client()

    def get_search_result(candidate):
        """Return the first result's summary for `candidate`, or None."""
        results = search_wikipedia(client, candidate, results_limit)
        if results:
            return results[0].summary
        return None

    # Check the search results with the full query
    summary = get_search_result(query)
    if summary:
        return [summary]

    # If no result, fall back to shorter word prefixes of the query.
    words = query.split()  # split once instead of on every iteration
    for size in range(len(words), 1, -1):  # from all words down to two
        n_grams_query = ' '.join(words[:size])
        if n_grams_query == query:
            # Identical to the full query that just failed; repeating it
            # would only cost another round of API requests.
            continue
        logging.info("Trying n-gram query: %s", n_grams_query)
        summary = get_search_result(n_grams_query)
        if summary:
            return [summary]

    # No variation of the query produced a result.
    logging.info("No results found for any query variations.")
    return None
# # Example usage | |
# if __name__ == "__main__": | |
# query = "Clash of Clans" | |
# results = get_wiki_data(query, results_limit=3) | |
# if not results: | |
# print(f"No results found for query: {query}") | |
# else: | |
# for idx, result in enumerate(results, 1): | |
# print(f"\nResult {idx}:") | |
# print("-" * 60) | |
# print(result)  # get_wiki_data returns plain summary strings, so format_result() does not apply