| | import requests |
| | from lxml import html |
| | from html_to_markdown import convert_to_markdown |
| |
|
| | |
| | |
| |
|
class MyLibreTextsAPI:
    """Scraper-style client for the LibreTexts Chemistry site.

    The listing methods (`get_bookshelves`, `get_books`,
    `get_book_sections`, `get_book_section_paragraphs`) download a page,
    select anchor tags by their exact ``class`` attribute value and return
    them as a list of ``{"title": ..., "url": ...}`` dictionaries built from
    each anchor's ``title`` and ``href`` attributes.
    `get_paragraph_contents` downloads a page and converts its HTML to
    Markdown.
    """

    LIBRETEXTS_BASE_URL = "https://chem.libretexts.org"
    user_agent_headers = {"user-Agent":
                          "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    # Seconds to wait for the server. `requests` applies no timeout unless
    # one is passed explicitly, so without this a stalled connection would
    # block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    # Exact `class` attribute values of the anchors selected on each page.
    _LISTING_LINK_CLASS = "mt-sortable-listing-link mt-edit-section internal"
    _PARAGRAPH_LINK_CLASS = "internal"

    def __init__(self):
        print("***KS*** Initializing LibreTexts API")

    def _fetch_html(self, url):
        """Download *url* and return the response body as text.

        Raises:
            requests.RequestException: on connection problems, timeouts, or
                an HTTP error status (4xx/5xx).
        """
        response = requests.get(
            url,
            headers=self.user_agent_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly instead of parsing an error page as if it were content.
        response.raise_for_status()
        return response.text

    @staticmethod
    def _extract_links(elements):
        """Turn anchor elements into ``{"title", "url"}`` dictionaries.

        Anchors missing either the ``title`` or the ``href`` attribute are
        skipped, so a single incomplete anchor cannot abort the whole
        listing with a ``KeyError``.
        """
        links = []
        for element in elements:
            title = element.attrib.get("title")
            url = element.attrib.get("href")
            if title is None or url is None:
                continue
            links.append({"title": title, "url": url})
        return links

    def _fetch_links(self, url, link_class):
        """Download *url* and return the links whose class equals *link_class*.

        The XPath compares the whole ``class`` attribute, so only anchors
        whose class string is exactly *link_class* are selected.
        """
        html_content = self._fetch_html(url)
        if not html_content.strip():
            # html.fromstring() rejects an empty document; nothing to list.
            return []
        tree = html.fromstring(html_content)
        elements = tree.xpath(f"//a[@class='{link_class}']")
        return self._extract_links(elements)

    def get_bookshelves(self):
        """Return the links listed on the site's ``/Bookshelves`` page.

        Returns:
            A list of ``{"title": str, "url": str}`` dictionaries.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        return self._fetch_links(
            f"{self.LIBRETEXTS_BASE_URL}/Bookshelves",
            self._LISTING_LINK_CLASS,
        )

    def get_books(self, bookshelf_url):
        """Return the links listed on the page at *bookshelf_url*.

        Args:
            bookshelf_url: URL of the page to download.

        Returns:
            A list of ``{"title": str, "url": str}`` dictionaries.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        return self._fetch_links(bookshelf_url, self._LISTING_LINK_CLASS)

    def get_book_sections(self, book_url):
        """Return the links listed on the page at *book_url*.

        Args:
            book_url: URL of the page to download.

        Returns:
            A list of ``{"title": str, "url": str}`` dictionaries.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        return self._fetch_links(book_url, self._LISTING_LINK_CLASS)

    def get_book_section_paragraphs(self, section_url):
        """Return the ``internal``-class links on the page at *section_url*.

        Unlike the other listing methods, this one selects anchors whose
        class attribute is exactly ``internal``.

        Args:
            section_url: URL of the page to download.

        Returns:
            A list of ``{"title": str, "url": str}`` dictionaries.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        return self._fetch_links(section_url, self._PARAGRAPH_LINK_CLASS)

    def get_paragraph_contents(self, paragraph_url):
        """Download *paragraph_url* and return its HTML converted to Markdown.

        Args:
            paragraph_url: URL of the page to download.

        Returns:
            The Markdown text produced by ``convert_to_markdown`` for the
            full page HTML.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        html_content = self._fetch_html(paragraph_url)
        return convert_to_markdown(html_content)
| |
|
| |
|