# NOTE: this header previously contained Hugging Face Spaces page chrome captured
# during extraction (paused-status labels, file size, commit-hash and line-number
# tables). That text is not Python and broke parsing; it has been removed.
from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
import datetime
import requests
import yaml
from tools.final_answer import FinalAnswerTool
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from Gradio_UI import GradioUI
# ✅ Tool wrapper function for SmolAgent
@tool
def scrape_drug_reviews_tool(drug_name: str, max_pages: int = 3) -> dict:
    """
    Scrapes reviews from the website Drugs.com (via plain HTTP requests) for a given drug name.
    Args:
        drug_name: the name of the target drug for which I want to retrieve reviews,
        max_pages: the number of pages of reviews from Drugs.com that I want to collect
    Output: a dictionary; on success {"reviews": [...]} where each record has
    "review" (the review text) and "source" (the page URL it came from),
    on failure {"error": message}.
    """
    try:
        df = scrape_drugs_com_reviews_requests(drug_name, max_pages)
        # Wrap the list of records in a dict so both branches honor the
        # declared `-> dict` return type (the bare list contradicted it).
        return {"reviews": df.to_dict(orient="records")}
    except Exception as e:
        # Report the failure to the agent instead of raising out of the tool.
        return {"error": str(e)}
# List of User-Agents for rotation
# One is picked at random per request (see the scraper below) so the
# traffic looks less uniform to the target site.
USER_AGENTS: list[str] = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0"
]
# Retry logic wrapper
def requests_retry_session(retries=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504), session=None):
    """
    Return a requests.Session whose HTTP and HTTPS transports retry automatically.

    Args:
        retries: maximum attempts for total / connect / read failures.
        backoff_factor: urllib3 exponential-backoff multiplier between attempts.
        status_forcelist: response status codes that trigger a retry.
        session: optional existing Session to configure; a new one is created
            when omitted.
    """
    sess = session if session is not None else requests.Session()
    retry_policy = Retry(
        total=retries,
        connect=retries,
        read=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # Mount one retrying adapter per scheme.
    for scheme in ("http://", "https://"):
        sess.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return sess
# Scraper function using requests
def scrape_drugs_com_reviews_requests(drug_name, max_pages=3, delay=2):
    """
    Collect user reviews for *drug_name* from drugs.com comment pages.

    Fetches up to *max_pages* pages with a retrying session and a randomly
    chosen User-Agent per request, sleeping *delay* seconds between pages.
    Stops early at the first page with no review blocks; per-page errors are
    printed and skipped (best-effort).

    Returns:
        pandas.DataFrame with columns "review" (text, or None when no
        paragraph was found) and "source" (the page URL).
    """
    base_url = f"https://www.drugs.com/comments/{drug_name}/"
    collected = []
    session = requests_retry_session()
    for page in range(1, max_pages + 1):
        page_url = f"{base_url}?page={page}" if page > 1 else base_url
        try:
            resp = session.get(
                page_url,
                headers={"User-Agent": random.choice(USER_AGENTS)},
                timeout=10,
            )
            resp.raise_for_status()
            parsed = BeautifulSoup(resp.text, "html.parser")
            blocks = parsed.find_all("div", class_="ddc-comment ddc-box ddc-mgb-2")
            if not blocks:
                # No review containers on this page: assume we ran past the end.
                print(f"No reviews found on page {page}.")
                break
            for comment in blocks:
                paragraph = comment.find("p")
                text = None
                if paragraph:
                    bold = paragraph.b
                    if bold:
                        # Strip the leading category label (e.g., "For Back Pain").
                        bold.extract()
                    text = paragraph.get_text(strip=True)
                collected.append({"review": text, "source": page_url})
            time.sleep(delay)  # Polite delay
        except Exception as exc:
            print(f"Error scraping {page_url}: {exc}")
            continue
    return pd.DataFrame(collected)
# --- Agent wiring: tools, model, prompts, UI ---
final_answer = FinalAnswerTool()

# If the agent does not answer, the model is overloaded, please use another model or the following Hugging Face Endpoint that also contains qwen2.5 coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # it is possible that this model may be overloaded
    custom_role_conversions=None,
)

# Load the prompt templates shipped alongside this app.
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

agent = CodeAgent(
    model=model,
    tools=[scrape_drug_reviews_tool, final_answer],  # add your tools here (don't remove final answer)
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="DrugReviewScraperAgent",
    description="Agent that can scrape drug reviews and analyze causal relations",
    prompt_templates=prompt_templates,
)

# Launch the Gradio chat UI for this agent.
# (A stray trailing "|" — table-extraction residue — has been removed; it was a syntax error.)
GradioUI(agent).launch()