# Web Scraping AI Agent: scrape a website with crawl4ai, index the text in
# FAISS, and answer questions about it with a Groq LLM through a Gradio UI.
import gradio as gr
import asyncio
from crawl4ai import AsyncWebCrawler
from urllib.parse import urlparse
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableMap, RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_groq import ChatGroq
import re
import os
from dotenv import load_dotenv
load_dotenv()
GROQ_API_KEY=os.getenv("GROQ_API_KEY")
qa_chain = None
scraped_file = None
# Clean LLM output
class StrictOutputParser(StrOutputParser):
def parse(self, text: str) -> str:
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
text = re.sub(r'^(Reasoning|Thought|Analysis):.*?\n', '', text, flags=re.IGNORECASE)
return text.strip()
# Async crawl function
async def crawl_site(url):
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=url)
return result.markdown
# UI-triggered scraper
def scrape_website(url):
global scraped_file
markdown = asyncio.run(crawl_site(url))
domain = urlparse(url).netloc.replace("www.", "")
filename = f"{domain}.txt"
with open(filename, "w", encoding="utf-8") as f:
f.write(markdown)
scraped_file = filename
return filename, markdown
# Query setup
def setup_qa():
global qa_chain
loader = TextLoader(scraped_file, encoding="utf-8")
docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(loader.load())
vectorstore = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))
retriever = vectorstore.as_retriever()
prompt = PromptTemplate.from_template("""
You are an AI assistant. Return ONLY the final answer.
**Rules (MUST follow):**
1. NO <think>, reasoning, or explanations.
2. NO markdown/formatting tags.
3. Answer in 3-4 concise sentences.
Context:
{context}
Question:
{question}
Answer (direct and short):""")
llm = ChatGroq(
api_key=GROQ_API_KEY, # Use environment variable for security
model="deepseek-r1-distill-llama-70b",
temperature=0.0
)
qa_chain = (
RunnableMap({
"context": retriever,
"question": RunnablePassthrough()
}) | prompt | llm | StrictOutputParser()
)
return "β
Query system ready!"
# Handle questions
def ask_question(query):
if not qa_chain:
return "β Please set up the QA system first."
return qa_chain.invoke(query)
# Gradio interface
with gr.Blocks(title="Web Scraping AI Agent") as demo:
gr.Markdown("## π Website Scraper AI Agent")
url_input = gr.Textbox(label="Enter Website URL")
scrape_btn = gr.Button("π Scrape Website")
download_output = gr.File(label="π Download Scraped File")
markdown_box = gr.Textbox(label="Scraped Text", lines=10)
setup_btn = gr.Button("π¬ Query This Website")
setup_status = gr.Textbox(label="Status")
query_input = gr.Textbox(label="Ask a Question")
query_btn = gr.Button("Ask")
query_output = gr.Textbox(label="Answer")
# Wire components
scrape_btn.click(fn=scrape_website, inputs=[url_input], outputs=[download_output, markdown_box])
setup_btn.click(fn=setup_qa, outputs=setup_status)
query_btn.click(fn=ask_question, inputs=[query_input], outputs=[query_output])
# Run
demo.launch(share=True)