Eduardo Guerra committed on
Commit
eed0f02
·
1 Parent(s): 39c6564

feat: Added web_scrapper_tool

Browse files
Files changed (6) hide show
  1. Dockerfile +46 -0
  2. app.py +1 -0
  3. requirements.txt +1 -0
  4. src/agent.py +22 -3
  5. src/tools/__init__.py +0 -0
  6. src/tools/web_scrapper.py +23 -0
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the agent app: Python 3.10 plus the headless Chromium
# that the Playwright-based scraping tool launches.
FROM python:3.10-slim

# Keep Playwright's browser binaries in a shared, world-readable location.
# By default they are downloaded into the installing user's home cache; the
# install below runs as root, while the app runs as the non-root `user`, which
# would otherwise look in its own (empty) cache and fail to launch a browser.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright

# Install system dependencies
RUN apt-get update && apt-get install -y \
    libx11-xcb1 \
    libxcomposite1 \
    libxrandr2 \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcups2 \
    libdbus-1-3 \
    libgdk-pixbuf2.0-0 \
    libnspr4 \
    libnss3 \
    libxss1 \
    libxtst6 \
    lsb-release \
    wget \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install Playwright and browsers
RUN pip install --upgrade pip
RUN pip install playwright
# Only Chromium is launched by the scraper tool, so skip the other engines.
# This must run as root because --with-deps installs OS packages.
RUN playwright install --with-deps chromium

# Set up a non-root user
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set the working directory
WORKDIR /home/user/app

# Copy the application files, owned by the runtime user so the app can
# write next to its sources if it needs to.
COPY --chown=user . .

# Install Python dependencies
RUN pip install -r requirements.txt

# Expose the port
EXPOSE 7860

# Command to run the application
CMD ["python", "app.py"]
app.py CHANGED
@@ -139,6 +139,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
139
  logger.info(
140
  f"Answer for task {task_id}: {submitted_answer}"
141
  )
 
142
  answers_payload.append(
143
  {
144
  "task_id": task_id,
 
139
  logger.info(
140
  f"Answer for task {task_id}: {submitted_answer}"
141
  )
142
+
143
  answers_payload.append(
144
  {
145
  "task_id": task_id,
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  duckduckgo-search==8.0.1
2
  gradio
3
  langchain-core==0.3.56
 
1
+ beautifulsoup4==4.13.4
2
  duckduckgo-search==8.0.1
3
  gradio
4
  langchain-core==0.3.56
src/agent.py CHANGED
@@ -13,6 +13,8 @@ from langchain_core.messages import AIMessage, HumanMessage
13
  from langchain_core.prompts import ChatPromptTemplate
14
  from langchain_openai import ChatOpenAI
15
 
 
 
16
  nest_asyncio.apply()
17
  logger = logging.getLogger(__name__)
18
 
@@ -24,7 +26,10 @@ class BasicAgent:
24
 
25
  prompt = ChatPromptTemplate.from_messages(
26
  [
27
- ("system", "You are a helpful assistant"),
 
 
 
28
  ("placeholder", "{chat_history}"),
29
  ("human", "{input}"),
30
  ("placeholder", "{agent_scratchpad}"),
@@ -52,7 +57,7 @@ class BasicAgent:
52
  # )
53
  # tools = toolkit.get_tools()
54
 
55
- tools = [DuckDuckGoSearchResults()]
56
  logger.info(f"Tools: {tools}")
57
 
58
  agent = create_tool_calling_agent(llm, tools, prompt)
@@ -70,7 +75,21 @@ class BasicAgent:
70
  def __call__(self, question: str) -> str:
71
  try:
72
  logger.info(f"Processing question: {question}")
73
- response = self.agent_executor.invoke({"input": question})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  logger.info(f"Response: {response}")
75
  return response
76
  except Exception as e:
 
13
  from langchain_core.prompts import ChatPromptTemplate
14
  from langchain_openai import ChatOpenAI
15
 
16
+ from src.tools.web_scrapper import web_scrapper_tool
17
+
18
  nest_asyncio.apply()
19
  logger = logging.getLogger(__name__)
20
 
 
26
 
27
  prompt = ChatPromptTemplate.from_messages(
28
  [
29
+ (
30
+ "system",
31
+ "You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise, additionally, only use numbers, don't add any units and don't use any other characters. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.",
32
+ ),
33
  ("placeholder", "{chat_history}"),
34
  ("human", "{input}"),
35
  ("placeholder", "{agent_scratchpad}"),
 
57
  # )
58
  # tools = toolkit.get_tools()
59
 
60
+ tools = [DuckDuckGoSearchResults(), web_scrapper_tool()]
61
  logger.info(f"Tools: {tools}")
62
 
63
  agent = create_tool_calling_agent(llm, tools, prompt)
 
75
  def __call__(self, question: str) -> str:
76
  try:
77
  logger.info(f"Processing question: {question}")
78
+
79
+ retries = 3
80
+ while retries > 0:
81
+ try:
82
+ response = self.agent_executor.invoke({"input": question})[
83
+ "output"
84
+ ]
85
+ response = response.split("FINAL ANSWER:")[1].strip()
86
+ break
87
+ except Exception as e:
88
+ logger.error(
89
+ f"Error processing question: {e}", exc_info=True
90
+ )
91
+ response = "Could not process question"
92
+ retries -= 1
93
  logger.info(f"Response: {response}")
94
  return response
95
  except Exception as e:
src/tools/__init__.py ADDED
File without changes
src/tools/web_scrapper.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from langgraph import Tool
3
+ from playwright.sync_api import sync_playwright
4
+
5
+
6
def extract_website_content(url: str) -> str:
    """Load *url* in headless Chromium and return the page's text.

    The page is opened with Playwright, its HTML is read back via
    ``page.content()``, and BeautifulSoup reduces that HTML to plain text.

    Args:
        url: Address of the page to load.

    Returns:
        The page text with ``<script>`` and ``<style>`` contents removed,
        each text fragment stripped and joined by newlines.

    Raises:
        Exception: Errors raised by Playwright while launching the browser
            or navigating are propagated; the browser is closed first.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            html_content = page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            browser.close()

    soup = BeautifulSoup(html_content, "html.parser")
    # Inline scripts and stylesheets are code, not page content; get_text()
    # would otherwise return their source verbatim.
    for tag in soup(["script", "style"]):
        tag.decompose()
    # strip=True drops whitespace-only fragments so the result is not
    # dominated by blank lines left over from the markup's indentation.
    return soup.get_text(separator="\n", strip=True)
16
+
17
+
18
def web_scrapper_tool():
    """Build the agent tool that wraps ``extract_website_content``.

    Returns:
        The object produced by ``Tool.from_function``, registered under the
        name ``"scrape_website"``.
    """
    # NOTE(review): this file imports `Tool` from `langgraph`; the
    # `Tool.from_function` API is normally provided by `langchain_core.tools`
    # -- confirm that the import actually resolves.
    tool_kwargs = {
        "func": extract_website_content,
        "name": "scrape_website",
        "description": "Extracts the main content of a webpage given its URL.",
    }
    return Tool.from_function(**tool_kwargs)