Upload 5 files
- agent_for_unit4/__init__.py +3 -0
- agent_for_unit4/agent.py +113 -0
- agent_for_unit4/db.py +49 -0
- agent_for_unit4/tools.py +205 -0
- agent_for_unit4/wiki.py +180 -0
agent_for_unit4/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .agent import manager_agent, prepare_for_input

__all__ = ["manager_agent", "prepare_for_input"]

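For context, a minimal sketch of how these two exports might be consumed from the Space's app code. The driver module, the sample question payload, and the file_base_url are assumptions and are not part of this upload:

# Hypothetical driver code; only the question/file_name keys and the two exports come from this package.
from agent_for_unit4 import manager_agent, prepare_for_input

question = {"question": "What is the capital of France?", "file_name": ""}  # assumed payload shape
prompt = prepare_for_input(question, file_base_url="https://example.com/files/")  # placeholder URL
answer = manager_agent.run(prompt)
print(answer)
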
agent_for_unit4/agent.py
ADDED
@@ -0,0 +1,113 @@
import base64
import os
from pathlib import Path
from textwrap import dedent

from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel, VisitWebpageTool

from .tools import RetrieveCSVStorageTool, SpeechRecognitionTool, VisualQATool, WikiTool, fetch_text_content, read_excel


def configure_open_telemetry() -> None:
    try:
        from openinference.instrumentation.smolagents import SmolagentsInstrumentor
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import SimpleSpanProcessor
    except ImportError:
        print("OpenTelemetry packages are not installed. Please install them to enable tracing.")
        return None

    try:
        langfuse_public_key = os.environ["LANGFUSE_PUBLIC_KEY"]
        langfuse_secret_key = os.environ["LANGFUSE_SECRET_KEY"]
    except KeyError:
        print("LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY must be set in the environment variables.")
        return None

    LANGFUSE_AUTH = base64.b64encode(f"{langfuse_public_key}:{langfuse_secret_key}".encode()).decode()
    os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://cloud.langfuse.com/api/public/otel"
    os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}"

    trace_provider = TracerProvider()
    trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))

    SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)


configure_open_telemetry()

wiki_storage_tool = RetrieveCSVStorageTool(
    table_name="wiki",
    init_storage=True,
    storage_path="./storage",
)

wiki_agent = CodeAgent(
    name="wiki_agent",
    description="""A wiki agent that can search and retrieve information from Wikipedia.
    It is specialized for handling Wikipedia articles, and is recommended over web_agent for retrieving information from Wikipedia.""",
    model=LiteLLMModel(model_id="openrouter/qwen/qwen-2.5-coder-32b-instruct"),
    tools=[
        DuckDuckGoSearchTool(),
        wiki_storage_tool,
        WikiTool(storage=wiki_storage_tool.get_storage()),
    ],
    max_steps=10,
    additional_authorized_imports=["pandas"],
)


web_agent = CodeAgent(
    name="web_agent",
    description="A web agent that can search and visit webpages.",
    model=LiteLLMModel(model_id="openrouter/qwen/qwen-2.5-coder-32b-instruct"),
    tools=[
        DuckDuckGoSearchTool(max_results=10),
        VisitWebpageTool(),
    ],
    verbosity_level=2,
    max_steps=10,
)


manager_agent = CodeAgent(
    name="manager_agent",
    model=LiteLLMModel(
        model_id="openrouter/qwen/qwq-32b",
    ),
    tools=[
        fetch_text_content,  # fetch text content from a URL
        SpeechRecognitionTool(),  # audio to text
        VisualQATool(),  # visual question answering
        read_excel,  # read Excel files
    ],
    managed_agents=[
        wiki_agent,
        web_agent,
    ],
    additional_authorized_imports=["pandas", "requests"],
    planning_interval=5,
    verbosity_level=2,
    max_steps=15,
)


def parse_file_name(file_base_url: str, file_name: str) -> str:
    if file_name == "":
        return "not provided"
    return file_base_url + Path(file_name).stem


def prepare_for_input(question: dict, file_base_url: str) -> str:
    input_text = dedent(f"""\
        Question:
        {question["question"]}

        If necessary, use the following file (it may not be provided)
        file_type: {Path(question["file_name"]).suffix}
        file: {parse_file_name(file_base_url, question["file_name"])}

        Video analysis tools are currently unavailable.
        If the question is about analyzing a video (e.g. questions about a YouTube link or an mp4 file), answer 'No Answer'.""")
    return input_text

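As a configuration note, a sketch of the environment this module appears to expect when imported. Only the two LANGFUSE keys and OPENROUTER_API_KEY are read explicitly in this upload; HF_TOKEN for the fal-ai InferenceClient and the exact variable LiteLLM uses for OpenRouter are assumptions:

import os

os.environ.setdefault("OPENROUTER_API_KEY", "sk-or-...")  # read in tools.py; assumed to also serve LiteLLM's openrouter/ models
os.environ.setdefault("LANGFUSE_PUBLIC_KEY", "pk-lf-...")  # optional: enables the OpenTelemetry/Langfuse tracing above
os.environ.setdefault("LANGFUSE_SECRET_KEY", "sk-lf-...")
os.environ.setdefault("HF_TOKEN", "hf_...")                # assumption: token picked up by the fal-ai InferenceClient in tools.py
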
agent_for_unit4/db.py
ADDED
@@ -0,0 +1,49 @@
import shelve
import shutil
from pathlib import Path
from typing import Any, Generic, TypeVar

T = TypeVar("T")


class ShelveDB(Generic[T]):
    dir_path: Path

    def __init__(self, db_name: str, init: bool) -> None:
        self.db_path = self.dir_path / db_name

        if init:
            self.dir_path.mkdir(parents=True, exist_ok=True)
            for file_path in self.dir_path.glob(f"{db_name}*"):
                if file_path.is_file():
                    file_path.unlink()
                elif file_path.is_dir():
                    shutil.rmtree(file_path)

    @classmethod
    def from_table(cls, table: str) -> "ShelveDB":
        return cls(table, False)

    def save(self, key: str, value: Any) -> None:
        with shelve.open(str(self.db_path)) as db:
            db[key] = value

    def fetch(self, key: str) -> T | None:
        with shelve.open(str(self.db_path)) as db:
            return db.get(key, None)

    def delete(self, key: str) -> bool:
        with shelve.open(str(self.db_path)) as db:
            if key in db:
                del db[key]
                return True
            return False

    def clear(self) -> None:
        with shelve.open(str(self.db_path)) as db:
            for key in list(db.keys()):
                del db[key]

    def list_keys(self) -> list[str]:
        with shelve.open(str(self.db_path)) as db:
            return list(db.keys())

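A minimal usage sketch of ShelveDB, assuming the dir_path class attribute is assigned before instantiation (which is what RetrieveCSVStorageTool does in tools.py); the "./storage" path and the sample DataFrame are only examples:

import pandas as pd
from pathlib import Path

from agent_for_unit4.db import ShelveDB

ShelveDB.dir_path = Path("./storage")            # set the storage directory before instantiating
db = ShelveDB[pd.DataFrame]("wiki", init=True)   # init=True wipes any existing "wiki*" files
db.save("table_1", pd.DataFrame({"a": [1, 2]}))
print(db.list_keys())                            # ['table_1']
print(db.fetch("table_1"))
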
agent_for_unit4/tools.py
ADDED
@@ -0,0 +1,205 @@
import os
from io import BytesIO
from pathlib import Path
from typing import Any

import pandas as pd
import requests
from huggingface_hub import InferenceClient
from smolagents import Tool, tool

from .db import ShelveDB
from .wiki import get_wiki_content


### convert table to markdown
@tool
def convert_pandas_table_to_markdown(table: pd.DataFrame) -> str:
    """
    Converts a pandas DataFrame to a markdown table.

    Args:
        table (pd.DataFrame): The DataFrame to convert.

    Returns:
        str: The markdown representation of the table.
    """
    return str(table.to_markdown())


### fetch text tool
@tool
def fetch_text_content(url: str) -> str:
    """
    Fetches the text content from a given URL.

    Args:
        url (str): The URL to fetch the text from.

    Returns:
        str: The text content of the page.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an error for bad responses
        return response.text
    except requests.RequestException as e:
        return f"Error fetching URL: {e}"


### Storage Tool
class RetrieveCSVStorageTool(Tool):
    name = "retrieve_csv_storage_tool"
    description = "Retrieves a CSV file from the storage and returns it as a pandas DataFrame."
    inputs = {
        "key": {
            "type": "string",
            "description": "The key to retrieve data from the table.",
        },
    }
    output_type = "any"

    def __init__(self, table_name: str, init_storage: bool, storage_path: str | None = None, *args: Any, **kwargs: Any):
        super().__init__(*args, **kwargs)
        if storage_path is not None:
            ShelveDB.dir_path = Path(storage_path)
        self.storage = ShelveDB[pd.DataFrame](table_name, init=init_storage)

    def get_storage(self) -> ShelveDB[pd.DataFrame]:
        return self.storage

    def forward(self, key: str) -> pd.DataFrame:
        try:
            # Retrieve the CSV file from storage
            dataframe = self.storage.fetch(key)
        except Exception as e:
            return f"Error retrieving data: {e}"
        else:
            if dataframe is None:
                raise ValueError(f"No data found for key: {key}")
            return dataframe


### Wikipedia Content Extraction Tool


class WikiTool(Tool):
    name = "wiki_tool"
    description = """Get Wikipedia page content and tables.
    Returns a tuple containing the page content and a dictionary of tables extracted from the page.
    The page content contains placeholders ({{table_1}}, {{table_2}}, ...) marking where each retrieved table appears.
    To understand what is contained in the tables, it is recommended to first display the content.
    Example 1:
        content, tables = get_wiki_content("Python_(programming_language)")
        print(content)

    The retrieved table objects are stored in storage.
    They can be retrieved using "retrieve_csv_storage_tool".
    Example 2:
        table: pd.DataFrame = retrieve_csv_storage_tool("table_1")
    """
    inputs = {
        "query": {
            "type": "string",
            "description": "The title of the Wikipedia page to visit. For example, 'Python_(programming_language)'.",
        },
        "language": {
            "type": "string",
            "description": "The language of the Wikipedia page. For example, 'en' for English, 'ja' for Japanese.",
        },
    }
    output_type = "array"

    def __init__(self, storage: ShelveDB[Any], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.storage = storage

    def forward(self, query: str, language: str) -> tuple[str, dict[str, pd.DataFrame]]:
        content, tables = get_wiki_content(query, language)
        self.storage.clear()
        for table_key, df in tables.items():
            self.storage.save(table_key, df)
        return content, tables


### Visual Question Answering Tool


def request_visual_qa(client: InferenceClient, question: str, image_url: str) -> str:
    contents = [{"type": "text", "text": question}, {"type": "image_url", "image_url": {"url": image_url}}]
    res = client.chat_completion(messages=[{"role": "user", "content": contents}], model="qwen/qwen2.5-vl-32b-instruct")
    content = res.choices[0].message.content
    if content is None:
        raise ValueError("No content returned from the model.")
    return content


class VisualQATool(Tool):
    name = "visual_qa_tool"
    description = "A tool that can answer questions about an image."
    inputs = {
        "image_url": {
            "type": "string",
            "description": "The URL of the image to analyze. No extension needed.",
        },
        "question": {
            "type": "string",
            "description": "The question to ask about the image.",
        },
    }
    output_type = "string"
    client = InferenceClient(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.environ["OPENROUTER_API_KEY"],
    )

    def forward(self, image_url: str, question: str) -> str:
        try:
            answer = request_visual_qa(self.client, question, image_url)
        except Exception as e:
            return f"Error: {str(e)}"
        else:
            return answer


### Speech Recognition Tool


def request_speech_recognition(client: InferenceClient, audio_file: str, model: str = "openai/whisper-large-v3") -> str:
    output = client.automatic_speech_recognition(audio_file, model=model)
    return output.text


class SpeechRecognitionTool(Tool):
    name = "speech_recognition"
    description = "Converts audio contents to text"
    inputs = {"audio_url": {"type": "string", "description": "URL of the audio file to transcribe. No extension needed."}}
    output_type = "string"
    client = InferenceClient(provider="fal-ai")
    _model = "openai/whisper-large-v3"

    def forward(self, audio_url: str) -> str:
        try:
            transcription = request_speech_recognition(self.client, audio_url, model=self._model)
        except Exception as e:
            return f"Error: {str(e)}"
        else:
            return transcription


### Excel Tool
@tool
def read_excel(file_url: str) -> pd.DataFrame:
    """
    Reads an Excel file from a given URL and returns the data as a DataFrame.

    Args:
        file_url (str): URL of the Excel file to read. No extension needed.

    Returns:
        pd.DataFrame: DataFrame containing the data from the first sheet of the Excel file
    """
    res = requests.get(file_url, timeout=30)
    res.raise_for_status()
    excel_data = BytesIO(res.content)
    df = pd.read_excel(excel_data)
    return df

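A sketch of how the Wikipedia tools cooperate, mirroring the wiring in agent.py. It requires network access and OPENROUTER_API_KEY to be set (tools.py reads it at import time); the page title is only an example:

from agent_for_unit4.tools import RetrieveCSVStorageTool, WikiTool

storage_tool = RetrieveCSVStorageTool(table_name="wiki", init_storage=True, storage_path="./storage")
wiki = WikiTool(storage=storage_tool.get_storage())

content, tables = wiki.forward("Python_(programming_language)", "en")
print(content[:500])                  # body text with {{table_N}} placeholders
df = storage_tool.forward("table_1")  # the same table, re-read from shelve storage
print(df.head())
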
agent_for_unit4/wiki.py
ADDED
@@ -0,0 +1,180 @@
import re
from io import StringIO
from typing import Any

import pandas as pd
import requests
from bs4 import BeautifulSoup


def process_list_element(list_element: Any, indent: int = 0) -> str:
    """Recursively process a list element into indented text lines."""
    result = []

    is_ordered = list_element.name == "ol"

    for i, li in enumerate(list_element.find_all("li", recursive=False)):
        # Get the list item's text, excluding any nested lists
        item_text = ""
        for content in li.contents:
            if content.name not in ["ul", "ol"]:
                item_text += str(content)

        item_text = BeautifulSoup(item_text, "html.parser").get_text().strip()

        # Use numbers for ordered lists, bullet symbols otherwise
        prefix = " " * indent + (f"{i + 1}. " if is_ordered else "* ")
        if item_text:
            result.append(prefix + item_text)

        # Process nested lists
        for nested_list in li.find_all(["ul", "ol"], recursive=False):
            nested_content = process_list_element(nested_list, indent + 1)
            if nested_content:
                result.append(nested_content)

    return "\n".join(result)


def get_wiki_content(title: str, language: str = "en") -> tuple[str, dict[str, pd.DataFrame]]:
    """
    Get Wikipedia page content and tables.

    Returns:
        A tuple containing the page content as a string and a dictionary of tables
        extracted from the page. The keys of the dictionary are "table_1", "table_2", etc.
        and the values are pandas DataFrames representing the tables.

    Example:
        content, tables = get_wiki_content("Python_(programming_language)")
        print(content)
        print(tables["table_1"])  # Access the first table

    Args:
        title: wikipedia page title (e.g., "Python_(programming_language)")
        language: wikipedia language (e.g., "en" for English, "ja" for Japanese)
    """
    # Build the parse API URL
    api_url = f"https://{language}.wikipedia.org/w/api.php"

    # API parameters
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text",
        "disabletoc": True,
    }

    # Send the request
    response = requests.get(api_url, params=params, timeout=30)  # type: ignore

    # Check the response
    if response.status_code != 200:
        raise Exception(f"api error: {response.status_code} - {response.text}")

    # Parse the JSON response
    data = response.json()

    # Check for API errors
    if "error" in data:
        raise Exception(f"api error: {data['error']['info']}")

    if "parse" not in data:
        raise Exception("api error: No parse data found")

    # Get the HTML content
    html_content = data["parse"]["text"]["*"]

    # Parse the HTML (one soup for table extraction, one for the body text)
    soup = BeautifulSoup(html_content, "html.parser")
    content_soup = BeautifulSoup(html_content, "html.parser")

    # Collect table information
    tables_dict: dict[str, pd.DataFrame] = {}
    table_ids: list[tuple[str, str]] = []  # list of (table_id, table_html)

    # Identify the target tables: wikitables and infoboxes
    table_index = 1

    # First, process the infoboxes (biography tables)
    infoboxes = soup.find_all("table", class_=lambda c: c and "infobox" in c)
    for table in infoboxes:
        table_id = f"table_{table_index}"
        table_ids.append((table_id, str(table)))
        table_index += 1

    # Next, process the wikitables
    wikitables = soup.find_all("table", class_="wikitable")
    for table in wikitables:
        table_id = f"table_{table_index}"
        table_ids.append((table_id, str(table)))
        table_index += 1

    # Parse the extracted tables with pandas
    for table_id, table_html in table_ids:
        try:
            dfs = pd.read_html(StringIO(table_html))
            if dfs:
                tables_dict[table_id] = dfs[0]
        except Exception:
            # Skip tables that fail to parse
            continue

    # Replace tables in the content with placeholders
    table_placeholders: dict[str, str] = {}

    # Handle infoboxes
    for i, table in enumerate(content_soup.find_all("table", class_=lambda c: c and "infobox" in c)):
        table_id = f"table_{i + 1}"
        if table_id in tables_dict:
            placeholder = f"{{{{{table_id}}}}}"
            table_placeholders[table_id] = placeholder
            table_placeholder_tag = content_soup.new_tag("p")
            table_placeholder_tag.string = placeholder
            table.replace_with(table_placeholder_tag)

    # Handle wikitables (indices continue after the infoboxes)
    wikitable_start_index = len(infoboxes) + 1
    for i, table in enumerate(content_soup.find_all("table", class_="wikitable")):
        table_id = f"table_{wikitable_start_index + i}"
        if table_id in tables_dict:
            placeholder = f"{{{{{table_id}}}}}"
            table_placeholders[table_id] = placeholder
            table_placeholder_tag = content_soup.new_tag("p")
            table_placeholder_tag.string = placeholder
            table.replace_with(table_placeholder_tag)

    # Extract clean body text: drop footnote markers and navigation/edit-link elements
    for element in content_soup.find_all("sup"):
        element.decompose()
    for element in content_soup.select("div.hatnote, div.navbox, span.mw-editsection"):
        element.decompose()

    # Collect headings, paragraphs, and lists
    elements = content_soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol"])
    text_content = []

    for element in elements:
        if element.name and element.name.startswith("h"):  # type: ignore
            level = int(element.name[1])  # type: ignore
            heading_text = element.get_text().strip()
            if heading_text:  # skip empty headings
                text_content.append("\n" + "#" * level + " " + heading_text)
        elif element.name == "p":  # type: ignore
            paragraph_text = element.get_text().strip()
            if paragraph_text:  # skip empty paragraphs
                # Table placeholders ({{table_N}}) are appended as-is
                if re.match(r"^\{\{table_\d+\}\}$", paragraph_text):
                    text_content.append(paragraph_text)
                else:
                    text_content.append(paragraph_text)
        elif element.name in ["ul", "ol"] and element.parent.name not in ["li", "ul", "ol"]:  # type: ignore
            # Only process top-level lists (nested ones are handled within their parent li)
            list_content = process_list_element(element)
            if list_content:
                text_content.append(list_content)

    # Join the text content
    content = "\n\n".join(text_content)

    return content, tables_dict

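A small illustrative sketch of process_list_element on a hand-written nested list (the HTML snippet is made up for illustration, not fetched from Wikipedia):

from bs4 import BeautifulSoup

from agent_for_unit4.wiki import process_list_element

html = "<ul><li>First<ul><li>Nested</li></ul></li><li>Second</li></ul>"
ul = BeautifulSoup(html, "html.parser").ul
print(process_list_element(ul))
# * First
#  * Nested
# * Second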