File size: 4,502 Bytes
9b5b26a
 
 
c19d193
6aae614
d182c1f
 
 
5a420eb
fd6f5bf
 
 
8fe992b
9b5b26a
 
57a46d9
 
cf0e5c7
f29c4cf
57a46d9
7a10d0b
 
34281b1
 
7a10d0b
 
9b5b26a
d182c1f
74ac7b1
57a46d9
d182c1f
57a46d9
d182c1f
 
57a46d9
fd6f5bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8f37fe
e781b0c
d182c1f
fd6f5bf
 
 
 
 
 
 
 
 
 
 
 
d182c1f
e781b0c
fd6f5bf
e781b0c
d182c1f
e781b0c
fd6f5bf
 
522a7c4
 
fd6f5bf
 
d182c1f
e781b0c
fd6f5bf
 
e781b0c
d182c1f
fd6f5bf
d182c1f
fd6f5bf
 
 
d182c1f
fd6f5bf
d182c1f
9b5b26a
6aae614
ae7a494
 
 
 
e121372
bf6d34c
 
29ec968
fe328e0
13d500a
8c01ffb
 
 
861422e
 
9b5b26a
8c01ffb
8fe992b
57a46d9
8c01ffb
 
 
 
792fd17
 
861422e
8fe992b
 
9b5b26a
8c01ffb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
import datetime
import requests
import yaml
from tools.final_answer import FinalAnswerTool
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from Gradio_UI import GradioUI


# ✅ Tool wrapper function for SmolAgent: thin adapter around the requests-based
# scraper below, converting its DataFrame result into plain JSON-serializable data.
@tool
def scrape_drug_reviews_tool(drug_name: str, max_pages: int = 3) -> dict :
    """
    Scrapes user reviews for a given drug from Drugs.com using plain HTTP requests.

    Args:
        drug_name: the name of the target drug for which I want to retrieve reviews,
        max_pages: the number of pages of reviews from Drugs.com that I want to collect

    Output: on success, a list of records, one per review, each of the form
    {"review": <review text or None>, "source": <url of the page it came from>};
    on failure, a dict {"error": <exception message>}.
    """
    # NOTE(review): the declared return type is `dict` but the success path
    # returns a list of records — kept as-is because the @tool decorator
    # validates this annotation; confirm before changing.
    try:
        df = scrape_drugs_com_reviews_requests(drug_name, max_pages)
        return df.to_dict(orient="records")
    except Exception as e:
        # Best-effort: surface the failure to the agent instead of raising.
        return {"error": str(e)}


        
# List of User-Agents for rotation — one is picked at random per page request
# (see scrape_drugs_com_reviews_requests) to look less like a single bot client.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0"
]

# Retry logic wrapper
def requests_retry_session(retries=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504), session=None):
    """Return a requests.Session that automatically retries failed requests.

    Args:
        retries: number of attempts for total/read/connect failures.
        backoff_factor: multiplier for urllib3's exponential backoff between retries.
        status_forcelist: HTTP status codes that trigger a retry.
        session: optional existing session to configure; a new one is created
            when falsy.

    Returns:
        The configured requests.Session.
    """
    sess = session or requests.Session()
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # Attach the same retry policy to both plain and TLS traffic.
    for scheme in ("http://", "https://"):
        sess.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return sess

# Scraper function using requests
def scrape_drugs_com_reviews_requests(drug_name, max_pages=3, delay=2):
    """Collect user reviews for *drug_name* from Drugs.com review pages.

    Args:
        drug_name: drug slug used to build the Drugs.com comments URL.
        max_pages: maximum number of listing pages to fetch.
        delay: seconds to sleep between successful page fetches (politeness).

    Returns:
        pandas.DataFrame with one row per review and columns
        "review" (text, or None when no paragraph was found) and
        "source" (URL of the page the review came from).
    """
    base_url = f"https://www.drugs.com/comments/{drug_name}/"
    session = requests_retry_session()
    collected = []

    for page_num in range(1, max_pages + 1):
        # Page 1 lives at the bare URL; later pages use a ?page= query string.
        url = f"{base_url}?page={page_num}" if page_num > 1 else base_url
        headers = {"User-Agent": random.choice(USER_AGENTS)}

        try:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")
            review_blocks = soup.find_all("div", class_="ddc-comment ddc-box ddc-mgb-2")

            # An empty page means we ran past the last page of reviews.
            if not review_blocks:
                print(f"No reviews found on page {page_num}.")
                break

            for block in review_blocks:
                paragraph = block.find("p")
                text = None
                if paragraph:
                    if paragraph.b:
                        paragraph.b.extract()  # remove category (e.g., "For Back Pain")
                    text = paragraph.get_text(strip=True)
                collected.append({"review": text, "source": url})

            time.sleep(delay)  # Polite delay

        except Exception as e:
            # Best-effort: report and move on to the next page.
            print(f"Error scraping {url}: {e}")
            continue

    return pd.DataFrame(collected)


# --- Script wiring: instantiate the tools, model, prompts, agent and UI. ---
final_answer = FinalAnswerTool()  # terminal tool the agent uses to emit its final answer

# If the agent does not answer, the model is overloaded, please use another model or the following Hugging Face Endpoint that also contains qwen2.5 coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud' 

# NOTE(review): max_tokens=2096 is an unusual value — possibly a typo for 2048; confirm.
model = HfApiModel(
max_tokens=2096,
temperature=0.5,
model_id='Qwen/Qwen2.5-Coder-32B-Instruct',# it is possible that this model may be overloaded
custom_role_conversions=None,
)



# Load the agent's prompt templates from the adjacent YAML file.
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)
    
agent = CodeAgent(
    model=model,
    tools=[scrape_drug_reviews_tool,final_answer], ## add your tools here (don't remove final answer)
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="DrugReviewScraperAgent",
    description="Agent that can scrape drug reviews and analyze causal relations",
    prompt_templates=prompt_templates
)


# Launch the Gradio web UI wrapping the agent (blocks until the server stops).
GradioUI(agent).launch()