alisamak committed
Commit 00b8b9c · verified · 1 Parent(s): 1abcd94

Update tools.py

Files changed (1)
  1. tools.py +181 -93
tools.py CHANGED
@@ -20,6 +20,91 @@ import re
  from datetime import datetime, timedelta
  from langchain_core.tools import tool

+ @tool
+ def route_question(question: str) -> str:
+     """
+     Determines the best tool to answer a given question.
+     Returns: one of 'search_web', 'extract_structured_facts_from_url', or 'use_internal_logic'
+     """
+     q = question.lower()
+
+     if any(keyword in q for keyword in ["how many", "list", "albums", "awards", "published", "release"]):
+         return "search_web"
+
+     if any(keyword in q for keyword in ["table", "section", "discography", "infobox", "html"]):
+         return "extract_structured_facts_from_url"
+
+     # Default to internal logic (math, logic puzzles, wordplay)
+     return "use_internal_logic"
+
+ @tool
+ def extract_structured_facts_from_url(url: str, selector: Optional[str] = None) -> str:
+     """
+     Extract structured facts (tables, bullet lists, or sections) from a webpage.
+     Args:
+         url (str): Target webpage URL.
+         selector (Optional[str]): Optional CSS selector to narrow down the section.
+     Returns:
+         str: Cleaned structured data from the page.
+     """
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, "html.parser")
+
+         # If a selector is provided, use it directly
+         if selector:
+             elements = soup.select(selector)
+         else:
+             # Fallback to first table or bullet list if no selector provided
+             elements = soup.select("table, ul")
+
+         if not elements:
+             return "No structured content found."
+
+         content_lines = []
+         for el in elements:
+             if el.name == "table":
+                 for row in el.find_all("tr"):
+                     cols = [td.get_text(strip=True) for td in row.find_all(["td", "th"])]
+                     if cols:
+                         content_lines.append(" | ".join(cols))
+             elif el.name == "ul":
+                 items = [li.get_text(strip=True) for li in el.find_all("li")]
+                 content_lines.extend(items)
+
+         return "\n".join(content_lines[:100])  # limit output size
+
+     except Exception as e:
+         return f"Error extracting structured facts: {str(e)}"
+
+ @tool
+ def search_web(query: str) -> List[str]:
+     """
+     Perform a web search using SerpAPI or Brave Search API and return a list of top URLs.
+     Args:
+         query (str): The search query to look up.
+     Returns:
+         List[str]: Top 5 result URLs (may vary depending on API used).
+     """
+     # Example using Brave Search API (you could use SerpAPI or other engine as well)
+     BRAVE_API_KEY = "your_brave_api_key_here"
+     url = "https://api.search.brave.com/res/v1/web/search"
+     headers = {"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY}
+     params = {"q": query, "count": 5}
+
+     try:
+         response = requests.get(url, headers=headers, params=params, timeout=10)
+         response.raise_for_status()
+         data = response.json()
+
+         urls = [item["url"] for item in data.get("web", {}).get("results", [])]
+         return urls if urls else ["No results found"]
+
+     except Exception as e:
+         return [f"Error during web search: {str(e)}"]
+
+
  @tool
  def categorize_grocery_items(items: list[str]) -> dict:
      """
@@ -64,53 +149,53 @@ def categorize_grocery_items(items: list[str]) -> dict:
      return result


- @tool
- def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
-     """
-     Searches the English Wikipedia featured article archive and returns article titles
-     promoted between start_date and end_date.
-     Args:
-         start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
-         end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
-     Returns:
-         list[str]: A list of article titles promoted as Featured Articles during that period.
-     """
-     print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
-     try:
-         base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
-         archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
-
-         start = datetime.strptime(start_date, "%Y-%m-%d")
-         end = datetime.strptime(end_date, "%Y-%m-%d")
-
-         # We'll collect year-specific pages
-         result_titles = []
-
-         for year in range(start.year, end.year + 1):
-             url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
-             response = requests.get(url)
-             if response.status_code != 200:
-                 continue
-
-             soup = BeautifulSoup(response.text, "html.parser")
-             for li in soup.select("li"):
-                 text = li.get_text()
-                 date_matches = date_matches = re.findall(r"\b(19\d{2}|20\d{2})-\d{2}-\d{2}\b", text)
-                 print("🔍 Date matches:", date_matches)
-
-                 for match in date_matches:
-                     try:
-                         d = datetime.strptime(match, "%Y-%m-%d")
-                         if start <= d <= end:
-                             a_tag = li.find("a")
-                             if a_tag:
-                                 result_titles.append(a_tag.get_text(strip=True))
-                     except:
-                         continue
-
-         return sorted(set(result_titles))
-     except Exception as e:
-         return [f"Error: {str(e)}"]
+ # @tool
+ # def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
+ #     """
+ #     Searches the English Wikipedia featured article archive and returns article titles
+ #     promoted between start_date and end_date.
+ #     Args:
+ #         start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
+ #         end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
+ #     Returns:
+ #         list[str]: A list of article titles promoted as Featured Articles during that period.
+ #     """
+ #     print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
+ #     try:
+ #         base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
+ #         archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
+
+ #         start = datetime.strptime(start_date, "%Y-%m-%d")
+ #         end = datetime.strptime(end_date, "%Y-%m-%d")
+
+ #         # We'll collect year-specific pages
+ #         result_titles = []
+
+ #         for year in range(start.year, end.year + 1):
+ #             url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
+ #             response = requests.get(url)
+ #             if response.status_code != 200:
+ #                 continue
+
+ #             soup = BeautifulSoup(response.text, "html.parser")
+ #             for li in soup.select("li"):
+ #                 text = li.get_text()
+ #                 date_matches = date_matches = re.findall(r"\b(19\d{2}|20\d{2})-\d{2}-\d{2}\b", text)
+ #                 print("🔍 Date matches:", date_matches)
+
+ #                 for match in date_matches:
+ #                     try:
+ #                         d = datetime.strptime(match, "%Y-%m-%d")
+ #                         if start <= d <= end:
+ #                             a_tag = li.find("a")
+ #                             if a_tag:
+ #                                 result_titles.append(a_tag.get_text(strip=True))
+ #                     except:
+ #                         continue
+
+ #         return sorted(set(result_titles))
+ #     except Exception as e:
+ #         return [f"Error: {str(e)}"]
  @tool
  def detect_non_commutative_subset(table_text: str) -> str:
      """
@@ -186,55 +271,55 @@ def detect_non_commutative_subset(table_text: str) -> str:
  # except Exception as e:
  #     return f"Error analyzing table: {str(e)}"

- @tool
- def extract_wikipedia_section_html(page_title: str, section_title: str, mode: str = "html") -> str:
-     """
-     Extracts content from a specific section of a Wikipedia article.
-     Args:
-         page_title (str): The title of the Wikipedia page (e.g., "Mercedes Sosa").
-         section_title (str): The section heading (e.g., "Studio albums").
-         mode (str): Either "html" (default) for raw HTML or "rows" to return cleaned table rows as plain text.
-     Returns:
-         str: The section content based on the mode, or an error message if not found.
-     """
-
-     print(f"🛠️ extract_wikipedia_section_html called")
-
-     try:
-         wikipedia.set_lang("en")
-         page = wikipedia.page(page_title)
-         soup = BeautifulSoup(page.html(), "html.parser")
-
-         # Find the section header
-         header = soup.find(lambda tag: tag.name in ["h2", "h3"] and section_title.lower() in tag.get_text(strip=True).lower())
-         if not header:
-             return f"Section '{section_title}' not found on page '{page_title}'."
-
-         if mode == "rows":
-             table = header.find_next("table")
-             if not table:
-                 return f"No table found under section '{section_title}'."
-             rows = table.find_all("tr")[1:]  # skip header row
-             lines = []
-             for row in rows:
-                 cols = row.find_all(["td", "th"])
-                 if len(cols) >= 2:
-                     year = cols[0].get_text(strip=True)
-                     title = cols[1].get_text(strip=True)
-                     lines.append(f"{year}: {title}")
-             return "\n".join(lines) if lines else f"No usable rows found in '{section_title}'."
-
-         # Default: return all content under section as HTML
-         section_html = []
-         for sibling in header.find_next_siblings():
-             if sibling.name in ["h2", "h3"]:
-                 break
-             section_html.append(str(sibling))
-
-         return "\n".join(section_html) if section_html else f"No content found under section '{section_title}'."
-
-     except Exception as e:
-         return f"Error extracting section '{section_title}' from page '{page_title}': {str(e)}"
+ # @tool
+ # def extract_wikipedia_section_html(page_title: str, section_title: str, mode: str = "html") -> str:
+ #     """
+ #     Extracts content from a specific section of a Wikipedia article.
+ #     Args:
+ #         page_title (str): The title of the Wikipedia page (e.g., "Mercedes Sosa").
+ #         section_title (str): The section heading (e.g., "Studio albums").
+ #         mode (str): Either "html" (default) for raw HTML or "rows" to return cleaned table rows as plain text.
+ #     Returns:
+ #         str: The section content based on the mode, or an error message if not found.
+ #     """
+
+ #     print(f"🛠️ extract_wikipedia_section_html called")
+
+ #     try:
+ #         wikipedia.set_lang("en")
+ #         page = wikipedia.page(page_title)
+ #         soup = BeautifulSoup(page.html(), "html.parser")
+
+ #         # Find the section header
+ #         header = soup.find(lambda tag: tag.name in ["h2", "h3"] and section_title.lower() in tag.get_text(strip=True).lower())
+ #         if not header:
+ #             return f"Section '{section_title}' not found on page '{page_title}'."
+
+ #         if mode == "rows":
+ #             table = header.find_next("table")
+ #             if not table:
+ #                 return f"No table found under section '{section_title}'."
+ #             rows = table.find_all("tr")[1:]  # skip header row
+ #             lines = []
+ #             for row in rows:
+ #                 cols = row.find_all(["td", "th"])
+ #                 if len(cols) >= 2:
+ #                     year = cols[0].get_text(strip=True)
+ #                     title = cols[1].get_text(strip=True)
+ #                     lines.append(f"{year}: {title}")
+ #             return "\n".join(lines) if lines else f"No usable rows found in '{section_title}'."
+
+ #         # Default: return all content under section as HTML
+ #         section_html = []
+ #         for sibling in header.find_next_siblings():
+ #             if sibling.name in ["h2", "h3"]:
+ #                 break
+ #             section_html.append(str(sibling))
+
+ #         return "\n".join(section_html) if section_html else f"No content found under section '{section_title}'."
+
+ #     except Exception as e:
+ #         return f"Error extracting section '{section_title}' from page '{page_title}': {str(e)}"

  @tool
  def reverse_sentence(sentence: str) -> str:
@@ -275,10 +360,13 @@ def filter_vegetables(items: list[str]) -> list[str]:

  # List of all tools
  all_tools = [
+     route_question,
+     extract_structured_facts_from_url,
+     search_web,
      # search_featured_articles_by_date_range,
      categorize_grocery_items,
      detect_non_commutative_subset,
      reverse_sentence,
      filter_vegetables,
-     extract_wikipedia_section_html
+     # extract_wikipedia_section_html
  ]
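
For orientation, a minimal, hypothetical sketch of how the updated all_tools list might be bound to a chat model via LangChain's bind_tools. The ChatOpenAI wrapper, model name, and example question are assumptions for illustration and are not part of this commit.

# Sketch only: assumes langchain-openai is installed and tools.py is importable.
from langchain_openai import ChatOpenAI

from tools import all_tools

llm = ChatOpenAI(model="gpt-4o-mini")        # illustrative model choice, not from the repo
llm_with_tools = llm.bind_tools(all_tools)   # expose the @tool functions as callable tools

reply = llm_with_tools.invoke("How many studio albums did Mercedes Sosa release between 2000 and 2009?")
print(reply.tool_calls)                      # the model may request e.g. search_web or extract_structured_facts_from_url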