# NOTE(review): the three lines here ("Spaces:" / "Runtime error" x2) were
# Hugging Face Spaces page chrome captured by the scrape, not program code.
"""Web Scraping AI Agent.

Scrapes a website with crawl4ai, indexes the text in a FAISS vector store,
and answers questions about it through a Groq-hosted LLM, behind a Gradio UI.
"""

import asyncio
import os
import re
from urllib.parse import urlparse

import gradio as gr
from crawl4ai import AsyncWebCrawler
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableMap, RunnablePassthrough
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings

# Load .env (if present) so the API key never has to be hard-coded.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Module-level state shared between the Gradio callbacks:
# qa_chain is built lazily by setup_qa(); scraped_file is set by scrape_website().
qa_chain = None
scraped_file = None
# Clean LLM output: strip chain-of-thought markup some models emit.
class StrictOutputParser(StrOutputParser):
    """Output parser that removes ``<think>...</think>`` blocks and a leading
    "Reasoning:/Thought:/Analysis:" line before returning the trimmed answer.
    """

    def parse(self, text: str) -> str:
        # Drop any <think>...</think> reasoning block (DOTALL lets it span lines).
        text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
        # Drop a leading "Reasoning:"/"Thought:"/"Analysis:" line, case-insensitively.
        text = re.sub(r'^(Reasoning|Thought|Analysis):.*?\n', '', text, flags=re.IGNORECASE)
        return text.strip()
# Async crawl function
async def crawl_site(url):
    """Crawl *url* with crawl4ai and return the page content as markdown.

    The crawler is used as an async context manager so its resources are
    released even if the crawl raises.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
        return result.markdown
# UI-triggered scraper
def scrape_website(url):
    """Scrape *url* and save its markdown to ``<domain>.txt``.

    Returns ``(filename, markdown)`` for the Gradio file/textbox outputs.
    Side effect: records the saved path in the module-global ``scraped_file``
    so ``setup_qa()`` can later index it.
    """
    global scraped_file
    markdown = asyncio.run(crawl_site(url))
    # Derive a filename from the host; fall back when the URL has no scheme
    # (urlparse then leaves netloc empty, which would yield the filename ".txt").
    domain = urlparse(url).netloc.replace("www.", "") or "scraped_site"
    filename = f"{domain}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(markdown)
    scraped_file = filename
    return filename, markdown
# Query setup
def setup_qa():
    """Build the retrieval-QA chain over the most recently scraped file.

    Loads the scraped text, splits it into 1000-char chunks with 100-char
    overlap, embeds them into a FAISS index, and wires
    retriever -> prompt -> Groq LLM -> parser into the module-global
    ``qa_chain``. Returns a status string for the UI.
    """
    global qa_chain
    # Guard: scraping must have happened first, otherwise there is no file to index.
    if not scraped_file:
        return "❌ Please scrape a website first."
    loader = TextLoader(scraped_file, encoding="utf-8")
    docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(loader.load())
    vectorstore = FAISS.from_documents(
        docs,
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    )
    retriever = vectorstore.as_retriever()
    prompt = PromptTemplate.from_template("""
You are an AI assistant. Return ONLY the final answer.
**Rules (MUST follow):**
1. NO <think>, reasoning, or explanations.
2. NO markdown/formatting tags.
3. Answer in 3-4 concise sentences.
Context:
{context}
Question:
{question}
Answer (direct and short):""")
    llm = ChatGroq(
        api_key=GROQ_API_KEY,  # read from the environment, never hard-coded
        model="deepseek-r1-distill-llama-70b",
        temperature=0.0,
    )
    qa_chain = (
        RunnableMap({
            "context": retriever,
            "question": RunnablePassthrough(),
        })
        | prompt
        | llm
        | StrictOutputParser()
    )
    # NOTE(review): "✅" reconstructed from mojibake ("β") in the pasted source.
    return "✅ Query system ready!"
# Handle questions
def ask_question(query):
    """Answer *query* against the indexed website.

    Requires ``setup_qa()`` to have populated the module-global ``qa_chain``;
    otherwise returns a user-facing error string instead of raising.
    """
    if not qa_chain:
        # NOTE(review): "❌" reconstructed from mojibake ("β") in the pasted source.
        return "❌ Please set up the QA system first."
    return qa_chain.invoke(query)
# Gradio interface
# NOTE(review): emoji in the labels below were mojibake in the pasted source
# ("π", "π¬", ...); reconstructed to plausible glyphs — confirm against the
# original app.
with gr.Blocks(title="Web Scraping AI Agent") as demo:
    gr.Markdown("## 🌐 Website Scraper AI Agent")
    url_input = gr.Textbox(label="Enter Website URL")
    scrape_btn = gr.Button("🔍 Scrape Website")
    download_output = gr.File(label="📥 Download Scraped File")
    markdown_box = gr.Textbox(label="Scraped Text", lines=10)
    setup_btn = gr.Button("💬 Query This Website")
    setup_status = gr.Textbox(label="Status")
    query_input = gr.Textbox(label="Ask a Question")
    query_btn = gr.Button("Ask")
    query_output = gr.Textbox(label="Answer")

    # Wire components: scrape -> (file, text), setup -> status, ask -> answer.
    scrape_btn.click(fn=scrape_website, inputs=[url_input], outputs=[download_output, markdown_box])
    setup_btn.click(fn=setup_qa, outputs=setup_status)
    query_btn.click(fn=ask_question, inputs=[query_input], outputs=[query_output])

# Run (share=True exposes a public Gradio link)
demo.launch(share=True)