alisamak committed
Commit 00b8b9c · verified · 1 Parent(s): 1abcd94

Update tools.py

Files changed (1)
  1. tools.py +181 -93
tools.py CHANGED
@@ -20,6 +20,91 @@ import re
  from datetime import datetime, timedelta
  from langchain_core.tools import tool

+ @tool
+ def route_question(question: str) -> str:
+     """
+     Determines the best tool to answer a given question.
+     Returns: one of 'search_web', 'extract_structured_facts_from_url', or 'use_internal_logic'
+     """
+     q = question.lower()
+
+     if any(keyword in q for keyword in ["how many", "list", "albums", "awards", "published", "release"]):
+         return "search_web"
+
+     if any(keyword in q for keyword in ["table", "section", "discography", "infobox", "html"]):
+         return "extract_structured_facts_from_url"
+
+     # Default to internal logic (math, logic puzzles, wordplay)
+     return "use_internal_logic"
+
+ @tool
+ def extract_structured_facts_from_url(url: str, selector: Optional[str] = None) -> str:
+     """
+     Extract structured facts (tables, bullet lists, or sections) from a webpage.
+     Args:
+         url (str): Target webpage URL.
+         selector (Optional[str]): Optional CSS selector to narrow down the section.
+     Returns:
+         str: Cleaned structured data from the page.
+     """
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, "html.parser")
+
+         # If a selector is provided, use it directly
+         if selector:
+             elements = soup.select(selector)
+         else:
+             # Fallback to first table or bullet list if no selector provided
+             elements = soup.select("table, ul")
+
+         if not elements:
+             return "No structured content found."
+
+         content_lines = []
+         for el in elements:
+             if el.name == "table":
+                 for row in el.find_all("tr"):
+                     cols = [td.get_text(strip=True) for td in row.find_all(["td", "th"])]
+                     if cols:
+                         content_lines.append(" | ".join(cols))
+             elif el.name == "ul":
+                 items = [li.get_text(strip=True) for li in el.find_all("li")]
+                 content_lines.extend(items)
+
+         return "\n".join(content_lines[:100])  # limit output size
+
+     except Exception as e:
+         return f"Error extracting structured facts: {str(e)}"
+
+ @tool
+ def search_web(query: str) -> List[str]:
+     """
+     Perform a web search using SerpAPI or Brave Search API and return a list of top URLs.
+     Args:
+         query (str): The search query to look up.
+     Returns:
+         List[str]: Top 5 result URLs (may vary depending on API used).
+     """
+     # Example using Brave Search API (you could use SerpAPI or other engine as well)
+     BRAVE_API_KEY = "your_brave_api_key_here"
+     url = "https://api.search.brave.com/res/v1/web/search"
+     headers = {"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY}
+     params = {"q": query, "count": 5}
+
+     try:
+         response = requests.get(url, headers=headers, params=params, timeout=10)
+         response.raise_for_status()
+         data = response.json()
+
+         urls = [item["url"] for item in data.get("web", {}).get("results", [])]
+         return urls if urls else ["No results found"]
+
+     except Exception as e:
+         return [f"Error during web search: {str(e)}"]
+
+
  @tool
  def categorize_grocery_items(items: list[str]) -> dict:
      """
@@ -64,53 +149,53 @@ def categorize_grocery_items(items: list[str]) -> dict:
      return result


- @tool
- def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
-     """
-     Searches the English Wikipedia featured article archive and returns article titles
-     promoted between start_date and end_date.
-     Args:
-         start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
-         end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
-     Returns:
-         list[str]: A list of article titles promoted as Featured Articles during that period.
-     """
-     print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
-     try:
-         base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
-         archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
-
-         start = datetime.strptime(start_date, "%Y-%m-%d")
-         end = datetime.strptime(end_date, "%Y-%m-%d")
-
-         # We'll collect year-specific pages
-         result_titles = []
-
-         for year in range(start.year, end.year + 1):
-             url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
-             response = requests.get(url)
-             if response.status_code != 200:
-                 continue
-
-             soup = BeautifulSoup(response.text, "html.parser")
-             for li in soup.select("li"):
-                 text = li.get_text()
-                 date_matches = date_matches = re.findall(r"\b(19\d{2}|20\d{2})-\d{2}-\d{2}\b", text)
-                 print("🔍 Date matches:", date_matches)
-
-                 for match in date_matches:
-                     try:
-                         d = datetime.strptime(match, "%Y-%m-%d")
-                         if start <= d <= end:
-                             a_tag = li.find("a")
-                             if a_tag:
-                                 result_titles.append(a_tag.get_text(strip=True))
-                     except:
-                         continue
-
-         return sorted(set(result_titles))
-     except Exception as e:
-         return [f"Error: {str(e)}"]
+ # @tool
+ # def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
+ #     """
+ #     Searches the English Wikipedia featured article archive and returns article titles
+ #     promoted between start_date and end_date.
+ #     Args:
+ #         start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
+ #         end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
+ #     Returns:
+ #         list[str]: A list of article titles promoted as Featured Articles during that period.
+ #     """
+ #     print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
+ #     try:
+ #         base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
+ #         archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
+
+ #         start = datetime.strptime(start_date, "%Y-%m-%d")
+ #         end = datetime.strptime(end_date, "%Y-%m-%d")
+
+ #         # We'll collect year-specific pages
+ #         result_titles = []
+
+ #         for year in range(start.year, end.year + 1):
+ #             url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
+ #             response = requests.get(url)
+ #             if response.status_code != 200:
+ #                 continue
+
+ #             soup = BeautifulSoup(response.text, "html.parser")
+ #             for li in soup.select("li"):
+ #                 text = li.get_text()
+ #                 date_matches = date_matches = re.findall(r"\b(19\d{2}|20\d{2})-\d{2}-\d{2}\b", text)
+ #                 print("🔍 Date matches:", date_matches)
+
+ #                 for match in date_matches:
+ #                     try:
+ #                         d = datetime.strptime(match, "%Y-%m-%d")
+ #                         if start <= d <= end:
+ #                             a_tag = li.find("a")
+ #                             if a_tag:
+ #                                 result_titles.append(a_tag.get_text(strip=True))
+ #                     except:
+ #                         continue
+
+ #         return sorted(set(result_titles))
+ #     except Exception as e:
+ #         return [f"Error: {str(e)}"]
  @tool
  def detect_non_commutative_subset(table_text: str) -> str:
      """
@@ -186,55 +271,55 @@ def detect_non_commutative_subset(table_text: str) -> str:
  # except Exception as e:
  #     return f"Error analyzing table: {str(e)}"

- @tool
- def extract_wikipedia_section_html(page_title: str, section_title: str, mode: str = "html") -> str:
-     """
-     Extracts content from a specific section of a Wikipedia article.
-     Args:
-         page_title (str): The title of the Wikipedia page (e.g., "Mercedes Sosa").
-         section_title (str): The section heading (e.g., "Studio albums").
-         mode (str): Either "html" (default) for raw HTML or "rows" to return cleaned table rows as plain text.
-     Returns:
-         str: The section content based on the mode, or an error message if not found.
-     """
-
-     print(f"🛠️ extract_wikipedia_section_html called")
-
-     try:
-         wikipedia.set_lang("en")
-         page = wikipedia.page(page_title)
-         soup = BeautifulSoup(page.html(), "html.parser")
-
-         # Find the section header
-         header = soup.find(lambda tag: tag.name in ["h2", "h3"] and section_title.lower() in tag.get_text(strip=True).lower())
-         if not header:
-             return f"Section '{section_title}' not found on page '{page_title}'."
-
-         if mode == "rows":
-             table = header.find_next("table")
-             if not table:
-                 return f"No table found under section '{section_title}'."
-             rows = table.find_all("tr")[1:]  # skip header row
-             lines = []
-             for row in rows:
-                 cols = row.find_all(["td", "th"])
-                 if len(cols) >= 2:
-                     year = cols[0].get_text(strip=True)
-                     title = cols[1].get_text(strip=True)
-                     lines.append(f"{year}: {title}")
-             return "\n".join(lines) if lines else f"No usable rows found in '{section_title}'."
-
-         # Default: return all content under section as HTML
-         section_html = []
-         for sibling in header.find_next_siblings():
-             if sibling.name in ["h2", "h3"]:
-                 break
-             section_html.append(str(sibling))
-
-         return "\n".join(section_html) if section_html else f"No content found under section '{section_title}'."
-
-     except Exception as e:
-         return f"Error extracting section '{section_title}' from page '{page_title}': {str(e)}"
+ # @tool
+ # def extract_wikipedia_section_html(page_title: str, section_title: str, mode: str = "html") -> str:
+ #     """
+ #     Extracts content from a specific section of a Wikipedia article.
+ #     Args:
+ #         page_title (str): The title of the Wikipedia page (e.g., "Mercedes Sosa").
+ #         section_title (str): The section heading (e.g., "Studio albums").
+ #         mode (str): Either "html" (default) for raw HTML or "rows" to return cleaned table rows as plain text.
+ #     Returns:
+ #         str: The section content based on the mode, or an error message if not found.
+ #     """
+
+ #     print(f"🛠️ extract_wikipedia_section_html called")
+
+ #     try:
+ #         wikipedia.set_lang("en")
+ #         page = wikipedia.page(page_title)
+ #         soup = BeautifulSoup(page.html(), "html.parser")
+
+ #         # Find the section header
+ #         header = soup.find(lambda tag: tag.name in ["h2", "h3"] and section_title.lower() in tag.get_text(strip=True).lower())
+ #         if not header:
+ #             return f"Section '{section_title}' not found on page '{page_title}'."
+
+ #         if mode == "rows":
+ #             table = header.find_next("table")
+ #             if not table:
+ #                 return f"No table found under section '{section_title}'."
+ #             rows = table.find_all("tr")[1:]  # skip header row
+ #             lines = []
+ #             for row in rows:
+ #                 cols = row.find_all(["td", "th"])
+ #                 if len(cols) >= 2:
+ #                     year = cols[0].get_text(strip=True)
+ #                     title = cols[1].get_text(strip=True)
+ #                     lines.append(f"{year}: {title}")
+ #             return "\n".join(lines) if lines else f"No usable rows found in '{section_title}'."
+
+ #         # Default: return all content under section as HTML
+ #         section_html = []
+ #         for sibling in header.find_next_siblings():
+ #             if sibling.name in ["h2", "h3"]:
+ #                 break
+ #             section_html.append(str(sibling))
+
+ #         return "\n".join(section_html) if section_html else f"No content found under section '{section_title}'."
+
+ #     except Exception as e:
+ #         return f"Error extracting section '{section_title}' from page '{page_title}': {str(e)}"

  @tool
  def reverse_sentence(sentence: str) -> str:
@@ -275,10 +360,13 @@ def filter_vegetables(items: list[str]) -> list[str]:

  # List of all tools
  all_tools = [
+     route_question,
+     extract_structured_facts_from_url,
+     search_web,
      # search_featured_articles_by_date_range,
      categorize_grocery_items,
      detect_non_commutative_subset,
      reverse_sentence,
      filter_vegetables,
-     extract_wikipedia_section_html
+     # extract_wikipedia_section_html
  ]
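
For orientation, a minimal, hypothetical sketch of how the updated all_tools list might be bound to a chat model via LangChain's bind_tools. The ChatOpenAI wrapper, model name, and example question are assumptions for illustration and are not part of this commit.

# Sketch only: assumes langchain-openai is installed and tools.py is importable.
from langchain_openai import ChatOpenAI

from tools import all_tools

llm = ChatOpenAI(model="gpt-4o-mini")        # illustrative model choice, not from the repo
llm_with_tools = llm.bind_tools(all_tools)   # expose the @tool functions as callable tools

reply = llm_with_tools.invoke("How many studio albums did Mercedes Sosa release between 2000 and 2009?")
print(reply.tool_calls)                      # the model may request e.g. search_web or extract_structured_facts_from_url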