Update tools.py
tools.py (changed)
@@ -20,6 +20,91 @@ import re
 from datetime import datetime, timedelta
 from langchain_core.tools import tool
 
+@tool
+def route_question(question: str) -> str:
+    """
+    Determines the best tool to answer a given question.
+    Returns: one of 'search_web', 'extract_structured_facts_from_url', or 'use_internal_logic'
+    """
+    q = question.lower()
+
+    if any(keyword in q for keyword in ["how many", "list", "albums", "awards", "published", "release"]):
+        return "search_web"
+
+    if any(keyword in q for keyword in ["table", "section", "discography", "infobox", "html"]):
+        return "extract_structured_facts_from_url"
+
+    # Default to internal logic (math, logic puzzles, wordplay)
+    return "use_internal_logic"
+
+@tool
+def extract_structured_facts_from_url(url: str, selector: Optional[str] = None) -> str:
+    """
+    Extract structured facts (tables, bullet lists, or sections) from a webpage.
+    Args:
+        url (str): Target webpage URL.
+        selector (Optional[str]): Optional CSS selector to narrow down the section.
+    Returns:
+        str: Cleaned structured data from the page.
+    """
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # If a selector is provided, use it directly
+        if selector:
+            elements = soup.select(selector)
+        else:
+            # Fall back to tables and bullet lists if no selector is provided
+            elements = soup.select("table, ul")
+
+        if not elements:
+            return "No structured content found."
+
+        content_lines = []
+        for el in elements:
+            if el.name == "table":
+                for row in el.find_all("tr"):
+                    cols = [td.get_text(strip=True) for td in row.find_all(["td", "th"])]
+                    if cols:
+                        content_lines.append(" | ".join(cols))
+            elif el.name == "ul":
+                items = [li.get_text(strip=True) for li in el.find_all("li")]
+                content_lines.extend(items)
+
+        return "\n".join(content_lines[:100])  # limit output size
+
+    except Exception as e:
+        return f"Error extracting structured facts: {str(e)}"
+
+@tool
+def search_web(query: str) -> List[str]:
+    """
+    Perform a web search using SerpAPI or Brave Search API and return a list of top URLs.
+    Args:
+        query (str): The search query to look up.
+    Returns:
+        List[str]: Top 5 result URLs (may vary depending on API used).
+    """
+    # Example using Brave Search API (SerpAPI or another engine would work as well)
+    BRAVE_API_KEY = "your_brave_api_key_here"
+    url = "https://api.search.brave.com/res/v1/web/search"
+    headers = {"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY}
+    params = {"q": query, "count": 5}
+
+    try:
+        response = requests.get(url, headers=headers, params=params, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+
+        urls = [item["url"] for item in data.get("web", {}).get("results", [])]
+        return urls if urls else ["No results found"]
+
+    except Exception as e:
+        return [f"Error during web search: {str(e)}"]
+
+
 @tool
 def categorize_grocery_items(items: list[str]) -> dict:
     """
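The three new tools call `requests`, `BeautifulSoup`, `Optional`, and `List`, none of which appear in this hunk's context lines. A minimal sketch of the imports tools.py would need near the top, assuming they are not already pulled in elsewhere in the file:

```python
# Hypothetical import block; skip any line tools.py already has.
from typing import List, Optional

import requests                # HTTP fetches in search_web and extract_structured_facts_from_url
from bs4 import BeautifulSoup  # HTML parsing in extract_structured_facts_from_url
```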
@@ -64,53 +149,53 @@ def categorize_grocery_items(items: list[str]) -> dict:
     return result
 
 
-@tool
-def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
-    """
-    Searches the English Wikipedia featured article archive and returns article titles
-    promoted between start_date and end_date.
-    Args:
-        start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
-        end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
-    Returns:
-        list[str]: A list of article titles promoted as Featured Articles during that period.
-    """
-    print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
-    try:
-        base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
-        archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
-
-        start = datetime.strptime(start_date, "%Y-%m-%d")
-        end = datetime.strptime(end_date, "%Y-%m-%d")
-
-        # We'll collect year-specific pages
-        result_titles = []
-
-        for year in range(start.year, end.year + 1):
-            url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
-            response = requests.get(url)
-            if response.status_code != 200:
-                continue
-
-            soup = BeautifulSoup(response.text, "html.parser")
-            for li in soup.select("li"):
-                text = li.get_text()
-                date_matches = re.findall(r"\b(19\d{2}|20\d{2})-\d{2}-\d{2}\b", text)
-                print("🔍 Date matches:", date_matches)
-
-                for match in date_matches:
-                    try:
-                        d = datetime.strptime(match, "%Y-%m-%d")
-                        if start <= d <= end:
-                            a_tag = li.find("a")
-                            if a_tag:
-                                result_titles.append(a_tag.get_text(strip=True))
-                    except:
-                        continue
-
-        return sorted(set(result_titles))
-    except Exception as e:
-        return [f"Error: {str(e)}"]
+# @tool
+# def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
+#     """
+#     Searches the English Wikipedia featured article archive and returns article titles
+#     promoted between start_date and end_date.
+#     Args:
+#         start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
+#         end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
+#     Returns:
+#         list[str]: A list of article titles promoted as Featured Articles during that period.
+#     """
+#     print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
+#     try:
+#         base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
+#         archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
+
+#         start = datetime.strptime(start_date, "%Y-%m-%d")
+#         end = datetime.strptime(end_date, "%Y-%m-%d")
+
+#         # We'll collect year-specific pages
+#         result_titles = []
+
+#         for year in range(start.year, end.year + 1):
+#             url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
+#             response = requests.get(url)
+#             if response.status_code != 200:
+#                 continue
+
+#             soup = BeautifulSoup(response.text, "html.parser")
+#             for li in soup.select("li"):
+#                 text = li.get_text()
+#                 date_matches = re.findall(r"\b(19\d{2}|20\d{2})-\d{2}-\d{2}\b", text)
+#                 print("🔍 Date matches:", date_matches)
+
+#                 for match in date_matches:
+#                     try:
+#                         d = datetime.strptime(match, "%Y-%m-%d")
+#                         if start <= d <= end:
+#                             a_tag = li.find("a")
+#                             if a_tag:
+#                                 result_titles.append(a_tag.get_text(strip=True))
+#                     except:
+#                         continue
+
+#         return sorted(set(result_titles))
+#     except Exception as e:
+#         return [f"Error: {str(e)}"]
 @tool
 def detect_non_commutative_subset(table_text: str) -> str:
     """
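A likely reason the retired scraper came back empty: its date regex wraps the year in a capturing group, so `re.findall` returns bare years such as `'2016'`, and every `datetime.strptime(match, "%Y-%m-%d")` call then fails into the bare `except`. A short sketch of the difference; the non-capturing variant is an illustration, not part of this commit:

```python
import re
from datetime import datetime

text = "Promoted 2016-11-14 after review"

# Capturing group: findall returns only the group, so strptime("%Y-%m-%d") raises ValueError.
print(re.findall(r"\b(19\d{2}|20\d{2})-\d{2}-\d{2}\b", text))        # ['2016']

# Non-capturing group: findall returns the full date, which parses cleanly.
matches = re.findall(r"\b(?:19\d{2}|20\d{2})-\d{2}-\d{2}\b", text)   # ['2016-11-14']
print([datetime.strptime(m, "%Y-%m-%d") for m in matches])
```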
@@ -186,55 +271,55 @@ def detect_non_commutative_subset(table_text: str) -> str:
     # except Exception as e:
     #     return f"Error analyzing table: {str(e)}"
 
-@tool
-def extract_wikipedia_section_html(page_title: str, section_title: str, mode: str = "html") -> str:
-    """
-    Extracts content from a specific section of a Wikipedia article.
-    Args:
-        page_title (str): The title of the Wikipedia page (e.g., "Mercedes Sosa").
-        section_title (str): The section heading (e.g., "Studio albums").
-        mode (str): Either "html" (default) for raw HTML or "rows" to return cleaned table rows as plain text.
-    Returns:
-        str: The section content based on the mode, or an error message if not found.
-    """
-    print(f"🛠️ extract_wikipedia_section_html called")
-
-    try:
-        wikipedia.set_lang("en")
-        page = wikipedia.page(page_title)
-        soup = BeautifulSoup(page.html(), "html.parser")
-
-        # Find the section header
-        header = soup.find(lambda tag: tag.name in ["h2", "h3"] and section_title.lower() in tag.get_text(strip=True).lower())
-        if not header:
-            return f"Section '{section_title}' not found on page '{page_title}'."
-
-        if mode == "rows":
-            table = header.find_next("table")
-            if not table:
-                return f"No table found under section '{section_title}'."
-            rows = table.find_all("tr")[1:]  # skip header row
-            lines = []
-            for row in rows:
-                cols = row.find_all(["td", "th"])
-                if len(cols) >= 2:
-                    year = cols[0].get_text(strip=True)
-                    title = cols[1].get_text(strip=True)
-                    lines.append(f"{year}: {title}")
-            return "\n".join(lines) if lines else f"No usable rows found in '{section_title}'."
-
-        # Default: return all content under section as HTML
-        section_html = []
-        for sibling in header.find_next_siblings():
-            if sibling.name in ["h2", "h3"]:
-                break
-            section_html.append(str(sibling))
-
-        return "\n".join(section_html) if section_html else f"No content found under section '{section_title}'."
-
-    except Exception as e:
-        return f"Error extracting section '{section_title}' from page '{page_title}': {str(e)}"
+# @tool
+# def extract_wikipedia_section_html(page_title: str, section_title: str, mode: str = "html") -> str:
+#     """
+#     Extracts content from a specific section of a Wikipedia article.
+#     Args:
+#         page_title (str): The title of the Wikipedia page (e.g., "Mercedes Sosa").
+#         section_title (str): The section heading (e.g., "Studio albums").
+#         mode (str): Either "html" (default) for raw HTML or "rows" to return cleaned table rows as plain text.
+#     Returns:
+#         str: The section content based on the mode, or an error message if not found.
+#     """
+#     print(f"🛠️ extract_wikipedia_section_html called")
+
+#     try:
+#         wikipedia.set_lang("en")
+#         page = wikipedia.page(page_title)
+#         soup = BeautifulSoup(page.html(), "html.parser")
+
+#         # Find the section header
+#         header = soup.find(lambda tag: tag.name in ["h2", "h3"] and section_title.lower() in tag.get_text(strip=True).lower())
+#         if not header:
+#             return f"Section '{section_title}' not found on page '{page_title}'."
+
+#         if mode == "rows":
+#             table = header.find_next("table")
+#             if not table:
+#                 return f"No table found under section '{section_title}'."
+#             rows = table.find_all("tr")[1:]  # skip header row
+#             lines = []
+#             for row in rows:
+#                 cols = row.find_all(["td", "th"])
+#                 if len(cols) >= 2:
+#                     year = cols[0].get_text(strip=True)
+#                     title = cols[1].get_text(strip=True)
+#                     lines.append(f"{year}: {title}")
+#             return "\n".join(lines) if lines else f"No usable rows found in '{section_title}'."
+
+#         # Default: return all content under section as HTML
+#         section_html = []
+#         for sibling in header.find_next_siblings():
+#             if sibling.name in ["h2", "h3"]:
+#                 break
+#             section_html.append(str(sibling))
+
+#         return "\n".join(section_html) if section_html else f"No content found under section '{section_title}'."
+
+#     except Exception as e:
+#         return f"Error extracting section '{section_title}' from page '{page_title}': {str(e)}"
 
 @tool
 def reverse_sentence(sentence: str) -> str:
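With `extract_wikipedia_section_html` commented out, the same kind of lookup is meant to flow through the generic extractor. A minimal usage sketch, assuming the tool is invoked directly as a LangChain runnable; the URL and CSS selector are illustrative, not part of the commit:

```python
# Hypothetical call mirroring the old "Studio albums" use case.
rows = extract_structured_facts_from_url.invoke({
    "url": "https://en.wikipedia.org/wiki/Mercedes_Sosa",
    "selector": "table.wikitable",
})
print(rows.splitlines()[:5])  # first few "cell | cell | ..." rows
```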
@@ -275,10 +360,13 @@ def filter_vegetables(items: list[str]) -> list[str]:
 
 # List of all tools
 all_tools = [
+    route_question,
+    extract_structured_facts_from_url,
+    search_web,
     # search_featured_articles_by_date_range,
     categorize_grocery_items,
     detect_non_commutative_subset,
     reverse_sentence,
    filter_vegetables,
-    extract_wikipedia_section_html
+    # extract_wikipedia_section_html
 ]
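How `all_tools` gets attached to a model is outside this diff. A minimal sketch of one common wiring, assuming a tool-calling chat model; the model name and the question are placeholders:

```python
# Hypothetical wiring; not part of this commit.
from langchain_openai import ChatOpenAI

from tools import all_tools

llm = ChatOpenAI(model="gpt-4o-mini")        # any chat model with tool-calling support
llm_with_tools = llm.bind_tools(all_tools)   # exposes each @tool schema to the model

response = llm_with_tools.invoke("Which of these are vegetables: milk, broccoli, rice, celery?")
print(response.tool_calls)                   # expect a call to filter_vegetables
```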