Eduardo Guerra
committed on
Commit
·
eed0f02
1
Parent(s):
39c6564
feat: Added web_scrapper_tool
Browse files
- Dockerfile +46 -0
- app.py +1 -0
- requirements.txt +1 -0
- src/agent.py +22 -3
- src/tools/__init__.py +0 -0
- src/tools/web_scrapper.py +23 -0
Dockerfile
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# Install the shared libraries a headless Chromium needs at runtime.
RUN apt-get update && apt-get install -y \
    libx11-xcb1 \
    libxcomposite1 \
    libxrandr2 \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcups2 \
    libdbus-1-3 \
    libgdk-pixbuf2.0-0 \
    libnspr4 \
    libnss3 \
    libxss1 \
    libxtst6 \
    lsb-release \
    wget \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install Playwright and browsers.
# The browsers are downloaded while still root, but the app runs as `user`.
# Playwright's default download location is the installing user's cache
# directory, which the runtime user would not search. Pin a shared location
# that both the install step and the runtime process resolve to, and make it
# readable/executable by everyone.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
RUN pip install --upgrade pip
RUN pip install playwright
RUN playwright install --with-deps \
    && chmod -R a+rX "$PLAYWRIGHT_BROWSERS_PATH"

# Set up a non-root user
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set the working directory
WORKDIR /home/user/app

# Copy the application files, owned by the runtime user so the app can
# write next to its sources (COPY defaults to root ownership).
COPY --chown=user . .

# Install Python dependencies
RUN pip install -r requirements.txt

# Expose the port
EXPOSE 7860

# Command to run the application
CMD ["python", "app.py"]
|
app.py
CHANGED
|
@@ -139,6 +139,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 139 |
logger.info(
|
| 140 |
f"Answer for task {task_id}: {submitted_answer}"
|
| 141 |
)
|
|
|
|
| 142 |
answers_payload.append(
|
| 143 |
{
|
| 144 |
"task_id": task_id,
|
|
|
|
| 139 |
logger.info(
|
| 140 |
f"Answer for task {task_id}: {submitted_answer}"
|
| 141 |
)
|
| 142 |
+
|
| 143 |
answers_payload.append(
|
| 144 |
{
|
| 145 |
"task_id": task_id,
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
duckduckgo-search==8.0.1
|
| 2 |
gradio
|
| 3 |
langchain-core==0.3.56
|
|
|
|
| 1 |
+
beautifulsoup4==4.13.4
|
| 2 |
duckduckgo-search==8.0.1
|
| 3 |
gradio
|
| 4 |
langchain-core==0.3.56
|
src/agent.py
CHANGED
|
@@ -13,6 +13,8 @@ from langchain_core.messages import AIMessage, HumanMessage
|
|
| 13 |
from langchain_core.prompts import ChatPromptTemplate
|
| 14 |
from langchain_openai import ChatOpenAI
|
| 15 |
|
|
|
|
|
|
|
| 16 |
nest_asyncio.apply()
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
|
@@ -24,7 +26,10 @@ class BasicAgent:
|
|
| 24 |
|
| 25 |
prompt = ChatPromptTemplate.from_messages(
|
| 26 |
[
|
| 27 |
-
(
|
|
|
|
|
|
|
|
|
|
| 28 |
("placeholder", "{chat_history}"),
|
| 29 |
("human", "{input}"),
|
| 30 |
("placeholder", "{agent_scratchpad}"),
|
|
@@ -52,7 +57,7 @@ class BasicAgent:
|
|
| 52 |
# )
|
| 53 |
# tools = toolkit.get_tools()
|
| 54 |
|
| 55 |
-
tools = [DuckDuckGoSearchResults()]
|
| 56 |
logger.info(f"Tools: {tools}")
|
| 57 |
|
| 58 |
agent = create_tool_calling_agent(llm, tools, prompt)
|
|
@@ -70,7 +75,21 @@ class BasicAgent:
|
|
| 70 |
def __call__(self, question: str) -> str:
|
| 71 |
try:
|
| 72 |
logger.info(f"Processing question: {question}")
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
logger.info(f"Response: {response}")
|
| 75 |
return response
|
| 76 |
except Exception as e:
|
|
|
|
| 13 |
from langchain_core.prompts import ChatPromptTemplate
|
| 14 |
from langchain_openai import ChatOpenAI
|
| 15 |
|
| 16 |
+
from src.tools.web_scrapper import web_scrapper_tool
|
| 17 |
+
|
| 18 |
nest_asyncio.apply()
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
|
|
|
| 26 |
|
| 27 |
prompt = ChatPromptTemplate.from_messages(
|
| 28 |
[
|
| 29 |
+
(
|
| 30 |
+
"system",
|
| 31 |
+
"You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise, additionally, only use numbers, don't add any units and don't use any other characters. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.",
|
| 32 |
+
),
|
| 33 |
("placeholder", "{chat_history}"),
|
| 34 |
("human", "{input}"),
|
| 35 |
("placeholder", "{agent_scratchpad}"),
|
|
|
|
| 57 |
# )
|
| 58 |
# tools = toolkit.get_tools()
|
| 59 |
|
| 60 |
+
tools = [DuckDuckGoSearchResults(), web_scrapper_tool()]
|
| 61 |
logger.info(f"Tools: {tools}")
|
| 62 |
|
| 63 |
agent = create_tool_calling_agent(llm, tools, prompt)
|
|
|
|
| 75 |
def __call__(self, question: str) -> str:
|
| 76 |
try:
|
| 77 |
logger.info(f"Processing question: {question}")
|
| 78 |
+
|
| 79 |
+
retries = 3
|
| 80 |
+
while retries > 0:
|
| 81 |
+
try:
|
| 82 |
+
response = self.agent_executor.invoke({"input": question})[
|
| 83 |
+
"output"
|
| 84 |
+
]
|
| 85 |
+
response = response.split("FINAL ANSWER:")[1].strip()
|
| 86 |
+
break
|
| 87 |
+
except Exception as e:
|
| 88 |
+
logger.error(
|
| 89 |
+
f"Error processing question: {e}", exc_info=True
|
| 90 |
+
)
|
| 91 |
+
response = "Could not process question"
|
| 92 |
+
retries -= 1
|
| 93 |
logger.info(f"Response: {response}")
|
| 94 |
return response
|
| 95 |
except Exception as e:
|
src/tools/__init__.py
ADDED
|
File without changes
|
src/tools/web_scrapper.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
from langgraph import Tool
|
| 3 |
+
from playwright.sync_api import sync_playwright
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def extract_website_content(url: str) -> str:
    """Return the text of the page at *url* after loading it in Chromium.

    The page is opened in a headless Chromium instance through Playwright's
    synchronous API, its HTML is captured, and the text is extracted from
    that HTML with BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to load.

    Returns:
        The string produced by ``BeautifulSoup.get_text()`` for the page.

    Raises:
        Any exception raised by Playwright while launching the browser,
        opening the page, or navigating is propagated to the caller.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            html_content = page.content()
        finally:
            # Close the browser explicitly even when navigation fails,
            # instead of skipping the close on the error path.
            browser.close()

    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def web_scrapper_tool():
    """Build the ``scrape_website`` tool that wraps ``extract_website_content``.

    Returns:
        The object returned by ``Tool.from_function`` for the scraper.
    """
    # NOTE(review): `Tool` is imported from `langgraph` at the top of this
    # module; LangChain documents `Tool` under `langchain_core.tools` --
    # confirm that the import resolves.
    tool_settings = {
        "func": extract_website_content,
        "name": "scrape_website",
        "description": "Extracts the main content of a webpage given its URL.",
    }
    return Tool.from_function(**tool_settings)
|