# WebScraperAgent / app.py
# Uploaded by Aniket00 — commit b4ccc57 ("Upload 3 files"), verified
import gradio as gr
import asyncio
from crawl4ai import AsyncWebCrawler
from urllib.parse import urlparse
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableMap, RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_groq import ChatGroq
import re
import os
from dotenv import load_dotenv
# Load variables from a .env file (if any) before the environment is read.
load_dotenv()

# Groq API key taken from the environment; None when the variable is unset.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Module-level state shared by the Gradio callbacks below.
scraped_file = None  # path of the most recently scraped text file
qa_chain = None      # QA chain; stays None until setup_qa() builds it
# Clean LLM output
class StrictOutputParser(StrOutputParser):
    """String output parser that removes model reasoning from the raw LLM text."""

    def parse(self, text: str) -> str:
        """Return *text* without ``<think>`` blocks or a leading reasoning line.

        Args:
            text: Raw text produced by the LLM.

        Returns:
            The text with every ``<think>...</think>`` span removed, a first
            line starting with ``Reasoning:``, ``Thought:`` or ``Analysis:``
            (any case) removed, and surrounding whitespace stripped.
        """
        # DOTALL lets "." cross newlines so multi-line <think> blocks are removed.
        text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
        # Removing a leading <think> block leaves blank lines at the start, and
        # "^" without MULTILINE only matches at position 0 — trim first so the
        # prefix-line pattern below can actually match.
        text = text.lstrip()
        text = re.sub(r'^(Reasoning|Thought|Analysis):.*?\n', '', text, flags=re.IGNORECASE)
        return text.strip()
# Async crawl function
async def crawl_site(url):
    """Crawl *url* with ``AsyncWebCrawler`` and return the result's markdown.

    Args:
        url: Address of the page to crawl.

    Returns:
        The ``markdown`` attribute of the crawl result.
    """
    async with AsyncWebCrawler() as web_crawler:
        crawl_result = await web_crawler.arun(url=url)
        # Read the attribute while the crawler context is still open.
        page_markdown = crawl_result.markdown
    return page_markdown
# UI-triggered scraper
def scrape_website(url):
    """Crawl *url*, save its markdown to ``<domain>.txt`` and return both.

    The file name is derived from the URL's network location; the path is also
    stored in the module-level ``scraped_file`` so ``setup_qa`` can load it.

    Args:
        url: Address entered in the UI textbox.

    Returns:
        A ``(filename, markdown)`` tuple: the path of the written file and the
        text that was written to it.
    """
    global scraped_file

    markdown = asyncio.run(crawl_site(url))
    # Guard against a None result so f.write() below cannot raise TypeError;
    # str() also normalises string-like result objects to plain text.
    markdown = "" if markdown is None else str(markdown)

    domain = urlparse(url).netloc
    # Strip only a leading "www." — str.replace() would also cut it out of the
    # middle of a host name.
    if domain.startswith("www."):
        domain = domain[len("www."):]
    # netloc may carry a port or credentials (":" / "@"), which are not safe
    # in file names on every platform.
    domain = re.sub(r'[^\w.-]', '_', domain)
    # A URL without a scheme parses to an empty netloc; avoid a bare ".txt".
    filename = f"{domain or 'scraped'}.txt"

    with open(filename, "w", encoding="utf-8") as f:
        f.write(markdown)

    scraped_file = filename
    return filename, markdown
# Query setup
def setup_qa():
    """Build the retrieval QA chain over the most recently scraped file.

    Loads the file named by the module-level ``scraped_file``, splits it into
    chunks, indexes the chunks in a FAISS vector store and stores the resulting
    chain in the module-level ``qa_chain``.

    Returns:
        A status message for the UI: a success message once the chain is
        built, or a hint when there is nothing to index yet.
    """
    global qa_chain

    # Mirror ask_question(): report the missing prerequisite instead of
    # letting TextLoader(None) raise.
    if not scraped_file:
        return "❗ Please scrape a website first."

    loader = TextLoader(scraped_file, encoding="utf-8")
    docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(loader.load())
    # An empty scrape yields no chunks, and a vector index cannot be built
    # from an empty document list.
    if not docs:
        return "❗ The scraped file is empty; nothing to index."

    vectorstore = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))
    retriever = vectorstore.as_retriever()

    def format_docs(documents):
        """Join the retrieved chunks' text so the prompt receives plain text."""
        return "\n\n".join(doc.page_content for doc in documents)

    prompt = PromptTemplate.from_template("""
You are an AI assistant. Return ONLY the final answer.
**Rules (MUST follow):**
1. NO <think>, reasoning, or explanations.
2. NO markdown/formatting tags.
3. Answer in 3-4 concise sentences.
Context:
{context}
Question:
{question}
Answer (direct and short):""")
    llm = ChatGroq(
        api_key=GROQ_API_KEY,  # Use environment variable for security
        model="deepseek-r1-distill-llama-70b",
        temperature=0.0
    )
    qa_chain = (
        RunnableMap({
            # Without format_docs the prompt would get the repr of a list of
            # Document objects (metadata and escaped newlines included).
            "context": retriever | format_docs,
            "question": RunnablePassthrough()
        }) | prompt | llm | StrictOutputParser()
    )
    return "✅ Query system ready!"
# Handle questions
def ask_question(query):
    """Answer *query* with the QA chain built by ``setup_qa``.

    Args:
        query: Question text from the UI textbox.

    Returns:
        The chain's answer, or a hint message when the question is blank or
        the chain has not been set up yet.
    """
    # A blank question would still be sent through the retriever and the LLM;
    # answer it locally instead.
    if not query or not query.strip():
        return "❗ Please enter a question."
    if not qa_chain:
        return "❗ Please set up the QA system first."
    return qa_chain.invoke(query)
# Gradio interface
# Layout: scrape controls, then QA setup, then the question/answer pair.
with gr.Blocks(title="Web Scraping AI Agent") as demo:
    gr.Markdown("## 🌐 Website Scraper AI Agent")

    # Step 1: scrape a page and expose the saved file plus its text.
    url_input = gr.Textbox(label="Enter Website URL")
    scrape_btn = gr.Button("🔍 Scrape Website")
    download_output = gr.File(label="📄 Download Scraped File")
    markdown_box = gr.Textbox(label="Scraped Text", lines=10)

    # Step 2: build the QA chain over the scraped file.
    setup_btn = gr.Button("💬 Query This Website")
    setup_status = gr.Textbox(label="Status")

    # Step 3: ask questions against the chain.
    query_input = gr.Textbox(label="Ask a Question")
    query_btn = gr.Button("Ask")
    query_output = gr.Textbox(label="Answer")

    # Wire components
    # scrape_website returns (filename, markdown), matching the two outputs.
    scrape_btn.click(fn=scrape_website, inputs=[url_input], outputs=[download_output, markdown_box])
    setup_btn.click(fn=setup_qa, outputs=setup_status)
    query_btn.click(fn=ask_question, inputs=[query_input], outputs=[query_output])

# Run
demo.launch(share=True)