| | import requests |
| | import json |
| | import wikitextparser as wtp |
| | from lxml import etree |
| | from lxml import html |
| |
|
| | |
| | |
| |
|
class MyWikiAPI:
    """Thin client for the English Wikipedia Action API.

    Provides helpers to locate a named section on a page, fetch its
    wikitext, filter a table inside it by a year range, and scrape the
    "Featured article candidates" log pages for a given month/year.
    """

    # Endpoint of the MediaWiki Action API (api.php).
    WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
    # Base URL of the featured-article log; "/{Month}_{Year}" is appended.
    WIKI_FEATURED_URL = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log"
    # NOTE(review): HTTP header names are case-insensitive, so the
    # non-canonical "user-Agent" spelling still works; kept byte-identical.
    user_agent_headers = {"user-Agent":
                          "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    # Seconds to wait on any single HTTP request before failing fast
    # (the original calls had no timeout and could hang indefinitely).
    REQUEST_TIMEOUT = 30

    def __init__(self):
        print(f"***KS*** Initializing Wiki API")

    def __find_section_on_page__(self, page_title, section_title):
        """Return the MediaWiki section index whose heading text equals
        ``section_title`` on ``page_title``, or None when no heading matches.

        Raises KeyError if the API response has no 'parse' payload
        (e.g. the page does not exist) — original behavior preserved.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'sections',
            },
            timeout=self.REQUEST_TIMEOUT,
        ).json()
        sections = response['parse']['sections']
        section_ix_found = None
        # Deliberately no early break: if several sections share a title,
        # the LAST match wins (original behavior preserved).
        for sec in sections:
            if sec["line"] == section_title:
                section_ix_found = sec["index"]
        return section_ix_found

    def __get_page_section_content__(self, page_title, section_id, format="wikitext"):
        """Fetch the content of one section of a page.

        ``format`` is the API ``prop`` name ('wikitext' or 'text') and is
        also the key under which the API nests the result.  The parameter
        name shadows the ``format`` builtin, but it is part of this
        method's signature and is kept for compatibility.
        """
        response = requests.get(
            self.WIKI_BASE_URL,
            headers=self.user_agent_headers,
            params={
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': format,
                'section': section_id
            },
            timeout=self.REQUEST_TIMEOUT,
        ).json()
        # The Action API (format=json) nests the body under a '*' key.
        return response["parse"][format]["*"]

    def __get_featured_log__(self, month, year):
        """Return the raw HTML of the featured-article log page for the
        given month (e.g. 'January') and year."""
        featured_url = f"{self.WIKI_FEATURED_URL}/{month}_{year}"
        print(f"Getting content for: {featured_url}")
        response = requests.get(
            featured_url,
            headers=self.user_agent_headers,
            timeout=self.REQUEST_TIMEOUT,
        ).text
        return response

    def __process_featured_log__(self, html_content):
        """Extract one text blob per article entry from a featured-log page.

        Each entry is an <h3>-level heading div; its two following siblings
        carry the entry text, which are concatenated.
        NOTE(review): assumes every heading has two following siblings —
        getnext() returning None would raise AttributeError; confirm the
        page structure before hardening.
        """
        tree = html.fromstring(html_content)
        article_heading = "mw-heading mw-heading3"
        elements = tree.xpath(f"//div[@class='{article_heading}']")
        element_texts = []
        for element in elements:
            n1 = element.getnext()
            n1_text = " ".join(n1.itertext())

            n2 = n1.getnext()
            n2_text = " ".join(n2.itertext())
            element_text = f"{n1_text} {n2_text}"

            element_texts.append(element_text)

        return element_texts

    def __is_int__(self, s):
        """Return True if ``s`` parses as an int, False otherwise.

        Only ValueError is treated as "not an int"; a non-string,
        non-numeric argument (e.g. None) still raises TypeError —
        original behavior preserved.
        """
        try:
            int(s)
        except ValueError:
            return False
        else:
            return True

    def get_category(self, category, year):
        """Fetch and print pages of a category for a year.

        BUG(review): ``__get_category_pages__`` is not defined anywhere in
        this file, so this call raises AttributeError at runtime; left as-is
        because the intended implementation is not visible here.
        """
        ret = self.__get_category_pages__(category, year)
        print(f"Got category: {category}\n{ret}")

    def get_page_section(self, page_title, section_title):
        """Return the wikitext of the named section of ``page_title``,
        or an empty string when the section is not found."""
        section_id = self.__find_section_on_page__(page_title, section_title)
        _ret = ""
        if section_id is not None:
            _ret = self.__get_page_section_content__(page_title, section_id)

        return _ret

    def filter_section_and_table(self, section_content, sub_section_name, year_start, year_end):
        """Parse ``section_content`` as wikitext, locate the last subsection
        whose title contains ``sub_section_name``, and return the rows of its
        first table whose first cell is an int within [year_start, year_end].

        Returns an empty list when no matching subsection or table exists.
        """
        parsed = wtp.parse(section_content)
        sections = parsed.sections
        section_found = None
        for sec in sections:
            if sec.title is not None and sec.title.find(sub_section_name) >= 0:
                section_found = sec

        # FIX: the original printed unconditionally and raised
        # AttributeError ('NoneType' has no 'title') when nothing matched.
        if section_found is not None:
            print(f"Found matching subsection: {section_found.title}")

        rows_collected = []
        if section_found is not None and section_found.tables is not None and len(section_found.tables) > 0:
            table_data = section_found.tables[0].data()
            for row in table_data:
                # Keep only rows keyed by a year inside the requested range.
                if self.__is_int__(row[0]) and year_start <= int(row[0]) <= year_end:
                    rows_collected.append(row)

        return rows_collected

    def get_featured_articles(self, month, year):
        """Return the list of featured-article entry texts for the log page
        of the given month and year."""
        full_html = self.__get_featured_log__(month, year)

        ret = self.__process_featured_log__(full_html)

        return ret