File size: 3,194 Bytes
bd70f31
70a30ed
3213b51
 
 
 
 
 
 
 
bd70f31
3213b51
bd70f31
3213b51
bd70f31
 
 
 
 
 
3213b51
 
 
 
 
 
 
bd70f31
70a30ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd70f31
3213b51
 
70a30ed
 
3213b51
 
 
 
70a30ed
3213b51
bd70f31
 
3213b51
 
 
bd70f31
3213b51
 
 
 
 
 
 
70a30ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3213b51
 
 
 
 
 
bd70f31
 
 
3213b51
bd70f31
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import time
from smolagents import (
    CodeAgent,
    LiteLLMModel,
    VisitWebpageTool,
    GoogleSearchTool,
    MCPClient,
    OpenAIServerModel
)

# Firecrawl credential, read once at import time; None when the variable is unset.
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY")

def get_anthropic_model():
    """Build a ``LiteLLMModel`` for ``claude-opus-4-20250514``.

    The API key is read from the ``ANTHROPIC_API_KEY`` environment variable
    with ``os.getenv``, so ``None`` is passed through when it is unset.
    """
    anthropic_key = os.getenv("ANTHROPIC_API_KEY")
    return LiteLLMModel(model_id="claude-opus-4-20250514", api_key=anthropic_key)

def get_openai_model():
    """Build an ``OpenAIServerModel`` for ``gpt-4o`` on the public OpenAI endpoint.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set
            (the key is looked up with ``os.environ[...]``).
    """
    openai_key = os.environ["OPENAI_API_KEY"]
    return OpenAIServerModel(
        model_id="gpt-4o",
        api_base="https://api.openai.com/v1",
        api_key=openai_key,
    )

def get_firecrawl_tools(max_retries=1, timeout=5):
    """Connect to the Firecrawl MCP server and return its tools.

    The connection is best-effort: a failure is reported on stdout and an
    empty list is returned, so callers can always unpack the result.

    Args:
        max_retries: Total number of connection attempts. For values below 1
            no attempt is made and an empty list is returned.
        timeout: Value forwarded as the ``"timeout"`` entry of the MCP
            server parameters.

    Returns:
        The result of ``MCPClient.get_tools()`` on success, otherwise ``[]``
        (no attempts requested, API key missing, or every attempt failed).
    """
    if max_retries < 1:
        # Without this guard the loop below never runs and the function would
        # fall through and return None, which callers cannot unpack.
        return []

    if not FIRECRAWL_API_KEY:
        # Avoid building a ".../None/sse" URL and making a pointless request.
        print("Failed to initialize Firecrawl tools: FIRECRAWL_API_KEY is not set")
        return []

    last_error = None
    for attempt in range(max_retries):
        try:
            firecrawl_mcp_client = MCPClient(
                server_parameters={
                    "url": f"https://mcp.firecrawl.dev/{FIRECRAWL_API_KEY}/sse",
                    "timeout": timeout,
                }
            )
            return firecrawl_mcp_client.get_tools()
        except Exception as e:
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(1)  # Wait before retrying

    # The API key is part of the URL and the error text may echo that URL,
    # so redact the key before printing.
    reason = str(last_error).replace(FIRECRAWL_API_KEY, "***")
    print(f"Failed to initialize Firecrawl tools after {max_retries} attempts: {reason}")
    return []

def get_agent(model):
    """Assemble the ``LLMFullTextGenerator`` ``CodeAgent`` around ``model``.

    The agent is given a web-page visitor, a Google search tool, the tools
    exposed by the sitemap-generator MCP server, and whatever
    ``get_firecrawl_tools()`` returns.

    The sitemap MCP connection is not guarded here, so an exception raised
    while connecting or listing its tools propagates to the caller.

    Args:
        model: Model object handed to ``CodeAgent`` as ``model``.

    Returns:
        The configured ``CodeAgent`` instance.
    """
    # Remote tools are fetched first, in the same order as before: sitemap, then Firecrawl.
    sitemap_client = MCPClient(
        server_parameters={
            "url": "https://a17o-sitemap-generator-mcp.hf.space/gradio_api/mcp/sse",
            "timeout": 30,
        }
    )
    remote_sitemap_tools = sitemap_client.get_tools()
    remote_firecrawl_tools = get_firecrawl_tools()

    # Local tools come first in the toolbox, followed by the remote ones.
    toolbox = [VisitWebpageTool(), GoogleSearchTool()]
    toolbox.extend(remote_sitemap_tools)
    toolbox.extend(remote_firecrawl_tools)

    agent_description = """
    You are a helpful assistant that can generate a full llm txt file from a website.
    You can use the following tools to help you:
    - VisitWebpageTool: to visit a website
    - GoogleSearchTool: to search the web
    - SitemapTools: to generate a sitemap for a website
    - FirecrawlTools: to crawl a website, always use a timeout of 15000 milliseconds.

    For FirecrawlTools, use the parameters of the following shape:
    ```
    {
        "name": "firecrawl_scrape",
        "arguments": {
            "url": "https://example.com",
            "formats": ["markdown"],
            "onlyMainContent": true,
            "waitFor": 1000,
            "timeout": 15000, // milliseconds
            "mobile": false,
            "includeTags": ["article", "main"],
            "excludeTags": ["nav", "footer"],
            "skipTlsVerification": false
        }
    }
    ```

    You also have access to an tool to generate a sitemap for a website.
    You can use the sitemap to have a better understanding of the website structure.
    You will be given a website url and you will need to generate a full llm txt file from the website.
    You can search for llm txt files on the web using the GoogleSearchTool with the following query: "site:docs.* llm-full.txt".    
    """
    return CodeAgent(
        model=model,
        name="LLMFullTextGenerator",
        description=agent_description,
        tools=toolbox,
    )