File size: 5,147 Bytes
9f21f05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional

import wikipediaapi

@dataclass
class WikiSearchResult:
    """Plain record describing one Wikipedia article.

    Instances are built by ``process_page`` and consumed by
    ``format_result``; the class carries data only and has no behaviour
    beyond what ``@dataclass`` generates.
    """

    # Article title (``page.title``).
    title: str
    # Article summary (``page.summary``).
    summary: str
    # Complete article text (``page.text``).
    full_text: str
    # Full URL of the article (``page.fullurl``).
    url: str
    # Timestamp parsed from the page's ``touched`` field.
    last_modified: datetime
    # Titles of the categories the article belongs to.
    categories: List[str]

def initialize_wikipedia_client(language: str = 'en', user_agent: str = 'WikipediaSearcher/1.0') -> wikipediaapi.Wikipedia:
    """
    Build a ``wikipediaapi.Wikipedia`` client.

    Args:
        language: Language code passed to the client (e.g., 'en' for English)
        user_agent: User agent string passed to the client

    Returns:
        A ``wikipediaapi.Wikipedia`` instance configured with the
        ``ExtractFormat.WIKI`` extract format.
    """
    client_options = {
        'language': language,
        'extract_format': wikipediaapi.ExtractFormat.WIKI,
        'user_agent': user_agent,
    }
    return wikipediaapi.Wikipedia(**client_options)

def process_page(page: wikipediaapi.WikipediaPage) -> WikiSearchResult:
    """
    Convert a Wikipedia page object into a ``WikiSearchResult``.

    Args:
        page: Page object to read the title, summary, text, URL,
            ``touched`` timestamp and categories from

    Returns:
        WikiSearchResult populated from the page's attributes
    """
    # Attributes are read in a fixed order: categories first, then the
    # scalar fields in the order of the WikiSearchResult definition.
    category_titles = []
    for category in page.categories.values():
        category_titles.append(category.title)

    title = page.title
    summary = page.summary
    text = page.text
    url = page.fullurl
    # NOTE(review): the trailing 'Z' is matched literally, so the parsed
    # datetime is naive; it presumably represents UTC - confirm.
    touched_at = datetime.strptime(page.touched, '%Y-%m-%dT%H:%M:%SZ')

    return WikiSearchResult(
        title=title,
        summary=summary,
        full_text=text,
        url=url,
        last_modified=touched_at,
        categories=category_titles,
    )

def search_wikipedia(client: wikipediaapi.Wikipedia, query: str, results_limit: int = 3) -> List[WikiSearchResult]:
    """
    Look up the article titled ``query`` and, optionally, articles it links to.

    The lookup is by exact page title (``client.page(query)``). When more
    than one result is requested, the remaining slots are filled from the
    page's outgoing links, in the order ``page.links`` yields them.

    Args:
        client: Wikipedia API client instance
        query: Page title to look up
        results_limit: Maximum number of results to return. Values of 1 or
            less return only the matching article itself.

    Returns:
        List of WikiSearchResult objects, the matching article first.
        Empty if the page does not exist or the lookup itself fails.
    """
    try:
        page = client.page(query)

        if not page.exists():
            logging.warning("No exact match found for: %s", query)
            return []

        results = [process_page(page)]
    except Exception as e:
        logging.error("Error searching Wikipedia: %s", e)
        return []

    if results_limit <= 1:
        return results

    # Related pages are a best-effort extra: a failure here must not discard
    # the primary article that was already retrieved successfully.
    try:
        for link_title in page.links:
            if len(results) >= results_limit:
                break
            link_page = client.page(link_title)
            # Links to missing pages are skipped without using up a slot,
            # so later valid links can still fill the requested limit.
            if link_page.exists():
                results.append(process_page(link_page))
    except Exception as e:
        logging.warning("Error fetching related pages for %s: %s", query, e)

    return results

def format_result(result: WikiSearchResult, include_full_text: bool = False) -> str:
    """
    Render a search result as a multi-line, human-readable string.

    Args:
        result: WikiSearchResult object to render
        include_full_text: When true, append the full article text after
            the summary

    Returns:
        Text block with title, URL, last-modified value, up to five
        categories (followed by '...' if there are more) and the summary.
    """
    # Only the first five categories are listed; an ellipsis marks the rest.
    category_line = ', '.join(result.categories[:5])
    if len(result.categories) > 5:
        category_line += '...'

    # The empty first and last entries give the leading and trailing newline.
    lines = [
        "",
        f"Title: {result.title}",
        f"URL: {result.url}",
        f"Last Modified: {result.last_modified}",
        f"Categories: {category_line}",
        "",
        "Summary:",
        f"{result.summary}",
        "",
    ]
    rendered = "\n".join(lines)

    if not include_full_text:
        return rendered
    return rendered + f"\nFull Text:\n{result.full_text}"

def get_wiki_data(query: str, results_limit: int = 3) -> Optional[List[str]]:
    """
    Get the Wikipedia summary for a query, falling back to shorter prefixes.

    The full query is tried first. If it yields no summary, the query is
    shortened one trailing word at a time (down to its first two words) and
    each prefix is tried until one produces a summary.

    Args:
        query: Search query string
        results_limit: Accepted for backward compatibility. Only the best
            match's summary is ever returned, so linked pages are not
            fetched regardless of this value.

    Returns:
        A one-element list holding the summary of the first matching
        article, or None if no variation of the query produced a summary.
    """
    # No-op if the root logger already has handlers configured.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    words = query.split()
    if not words:
        # Nothing to look up; avoid a pointless API round trip.
        logging.info("No results found for any query variations.")
        return None

    client = initialize_wikipedia_client()

    def get_search_result(candidate):
        """Return the summary of the page titled ``candidate``, or None."""
        # Only the first result is used, so request exactly one: this avoids
        # fetching linked pages whose data would be thrown away.
        matches = search_wikipedia(client, candidate, 1)
        if matches:
            return matches[0].summary
        return None

    # Check the search results with the full query
    summary = get_search_result(query)
    if summary:
        return [summary]

    # If no result, retry with word prefixes of decreasing length (n .. 2).
    # Candidates identical to an already-tried query are skipped; the
    # n-word prefix usually equals the full query that just failed.
    tried = {query}
    for size in range(len(words), 1, -1):
        candidate = ' '.join(words[:size])
        if candidate in tried:
            continue
        tried.add(candidate)
        logging.info(f"Trying n-gram query: {candidate}")
        summary = get_search_result(candidate)
        if summary:
            return [summary]

    # If no results found after all n-gram reductions, return None
    logging.info("No results found for any query variations.")
    return None

# Example usage (get_wiki_data returns a list of summary strings, or None):
# if __name__ == "__main__":
#     query = "Clash of Clans"
#     results = get_wiki_data(query, results_limit=3)
#
#     if not results:
#         print(f"No results found for query: {query}")
#     else:
#         for idx, summary in enumerate(results, 1):
#             print(f"\nResult {idx}:")
#             print("-" * 60)
#             print(summary)