File size: 5,147 Bytes
9f21f05 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional

import wikipediaapi
@dataclass
class WikiSearchResult:
    """Record holding the details extracted from one Wikipedia article."""

    title: str  # article title
    summary: str  # summary text of the article
    full_text: str  # complete article text
    url: str  # full URL of the article
    last_modified: datetime  # timestamp of the article's last modification
    categories: List[str]  # titles of the categories the article belongs to
def initialize_wikipedia_client(language: str = 'en', user_agent: str = 'WikipediaSearcher/1.0') -> wikipediaapi.Wikipedia:
    """
    Build a Wikipedia API client that returns wiki-formatted extracts.

    Args:
        language: Language code of the Wikipedia edition (e.g., 'en' for English)
        user_agent: User agent string to use for API requests

    Returns:
        A configured ``wikipediaapi.Wikipedia`` client instance
    """
    client_options = {
        'language': language,
        'extract_format': wikipediaapi.ExtractFormat.WIKI,
        'user_agent': user_agent,
    }
    return wikipediaapi.Wikipedia(**client_options)
def process_page(page: wikipediaapi.WikipediaPage) -> WikiSearchResult:
    """Convert a Wikipedia page object into a WikiSearchResult record."""
    category_titles = []
    for category in page.categories.values():
        category_titles.append(category.title)

    # ``page.touched`` is parsed with a fixed layout; the resulting datetime
    # carries no tzinfo.
    touched_format = '%Y-%m-%dT%H:%M:%SZ'

    # Attributes are read in the same order as the record's fields.
    record_fields = {
        'title': page.title,
        'summary': page.summary,
        'full_text': page.text,
        'url': page.fullurl,
        'last_modified': datetime.strptime(page.touched, touched_format),
        'categories': category_titles,
    }
    return WikiSearchResult(**record_fields)
def search_wikipedia(client: wikipediaapi.Wikipedia, query: str, results_limit: int = 3) -> List[WikiSearchResult]:
    """
    Look up the page titled ``query`` and, optionally, pages it links to.

    The page whose title matches ``query`` is always the first result. When
    ``results_limit`` is greater than 1, up to ``results_limit - 1`` of the
    page's links are looked up as well and appended if they exist.

    A failure while listing or fetching linked pages is logged and skipped;
    it never discards results that were already collected.

    Args:
        client: Wikipedia API client instance
        query: Title to look up
        results_limit: Maximum number of results to return. Values of 1 or
            less yield only the exact match.

    Returns:
        List of WikiSearchResult objects; empty when the page does not exist
        or the primary lookup fails.
    """
    try:
        page = client.page(query)
        if not page.exists():
            logging.warning(f"No exact match found for: {query}")
            return []
        results = [process_page(page)]
    except Exception as e:
        logging.error(f"Error searching Wikipedia: {e}")
        return []

    extra_wanted = results_limit - 1
    if extra_wanted <= 0:
        return results

    # Listing the links may itself hit the API; keep the exact match if it fails.
    try:
        link_titles = list(page.links)[:extra_wanted]
    except Exception as e:
        logging.error(f"Error searching Wikipedia: {e}")
        return results

    for link_title in link_titles:
        # Each linked page is handled on its own so that one bad page does
        # not throw away the exact match or the other linked pages.
        try:
            link_page = client.page(link_title)
            if link_page.exists():
                results.append(process_page(link_page))
        except Exception as e:
            logging.warning(f"Skipping linked page {link_title}: {e}")

    return results
def format_result(result: WikiSearchResult, include_full_text: bool = False) -> str:
    """
    Render a search result as display text.

    At most five categories are listed; an ellipsis is appended when the
    result has more than five.

    Args:
        result: WikiSearchResult object to render
        include_full_text: When True, append the full article text

    Returns:
        Formatted string containing the article information
    """
    shown_categories = ', '.join(result.categories[:5])
    if len(result.categories) > 5:
        shown_categories += '...'

    # Leading and trailing empty entries reproduce the newlines that open and
    # close the rendered text.
    lines = [
        "",
        f"Title: {result.title}",
        f"URL: {result.url}",
        f"Last Modified: {result.last_modified}",
        f"Categories: {shown_categories}",
        "Summary:",
        f"{result.summary}",
        "",
    ]
    formatted = "\n".join(lines)

    if include_full_text:
        formatted = f"{formatted}\nFull Text:\n{result.full_text}"
    return formatted
def get_wiki_data(query: str, results_limit: int = 3) -> Optional[List[str]]:
    """
    Get the Wikipedia summary for a query, shortening the query if needed.

    The full query is tried first. If it yields no summary, progressively
    shorter prefixes of the query's words are tried, down to the first two
    words, until one produces a non-empty summary.

    Note: this function calls ``logging.basicConfig``, which configures the
    root logger if it has no handlers yet.

    Args:
        query: Search query string
        results_limit: Kept for interface compatibility. Only the top match's
            summary is ever returned, so a single page is requested per
            lookup regardless of this value.

    Returns:
        A one-element list holding the summary of the first matching page,
        or None if no query variation produced a summary.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    words = query.split()
    if not words:
        # A blank query cannot match any page; skip the API round trip.
        logging.info("No results found for any query variations.")
        return None

    client = initialize_wikipedia_client()

    def first_summary(candidate: str) -> Optional[str]:
        """Return the summary of the page matching ``candidate``, if any."""
        # Request one result only: linked pages would be fetched and then
        # thrown away, since just the first summary is used.
        matches = search_wikipedia(client, candidate, 1)
        return matches[0].summary if matches else None

    # Try the query exactly as given first.
    summary = first_summary(query)
    if summary:
        return [summary]

    # Fall back to shorter word prefixes, from all words down to two.
    for size in range(len(words), 1, -1):
        candidate = ' '.join(words[:size])
        if candidate == query:
            # Same text as the lookup that just failed; do not repeat it.
            continue
        logging.info(f"Trying n-gram query: {candidate}")
        summary = first_summary(candidate)
        if summary:
            return [summary]

    logging.info("No results found for any query variations.")
    return None
# # Example usage
# if __name__ == "__main__":
#     query = "Clash of Clans"
#     results = get_wiki_data(query, results_limit=3)
#     if not results:
#         print(f"No results found for query: {query}")
#     else:
#         for idx, result in enumerate(results, 1):
#             print(f"\nResult {idx}:")
#             print("-" * 60)
#             # get_wiki_data returns summary strings, so print them directly
#             # (format_result expects a WikiSearchResult object).
#             print(result)
|