Spaces:

Shriharsh
/

Web_Content_QA

Running

File size: 5,995 Bytes

# Web Content Q&A Tool for Hugging Face Spaces
# Optimized for memory constraints (2GB RAM) and 24-hour timeline
# Features: Ingest up to 3 URLs, ask questions, get concise answers using DistilBERT

import gradio as gr
from bs4 import BeautifulSoup
import requests
from sentence_transformers import SentenceTransformer, util
import numpy as np
from transformers import pipeline

# ---- In-memory store (nothing is persisted; an app restart wipes it) ----
corpus: list = []        # paragraph texts scraped from the ingested URLs
sources_list: list = []  # sources_list[i] is the URL that corpus[i] came from
embeddings = None        # retriever output for corpus; None until ingestion succeeds

# ---- Models, loaded once at startup (author's estimate: ~340MB combined) ----
_RETRIEVER_NAME = 'all-MiniLM-L6-v2'  # sentence encoder, ~80MB, 384-dim embeddings
_QA_MODEL_NAME = "distilbert-base-uncased-distilled-squad"  # extractive QA, ~260MB

retriever = SentenceTransformer(_RETRIEVER_NAME)
qa_model = pipeline("question-answering", model=_QA_MODEL_NAME)

def _extract_paragraphs(html, limit=100, min_chars=20):
    """Return up to `limit` distinct text blocks longer than `min_chars` from <p>/<div> tags."""
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = []
    seen = set()
    for elem in soup.find_all(['p', 'div']):
        # get_text() is recursive, so a <div> that wraps other <p>/<div> tags
        # would repeat all of their text; keep only leaf-level blocks.
        if elem.name == 'div' and elem.find(['p', 'div']) is not None:
            continue
        text = elem.get_text().strip()
        if len(text) <= min_chars or text in seen:
            continue
        seen.add(text)
        paragraphs.append(text)
        if len(paragraphs) >= limit:
            break  # cap reached; no point scanning the rest of the page
    return paragraphs


def ingest_urls(urls):
    """
    Scrape up to 3 URLs and rebuild the in-memory retrieval index.

    Args:
        urls: Newline-separated URLs as typed into the textbox. Blank lines and
            repeated URLs are ignored; only the first 3 distinct entries are used.

    Returns:
        A status string for the UI. "Success: ..." when at least one paragraph
        was indexed, followed by one line per URL that failed or had no usable
        text. If nothing could be indexed, the per-URL "Error"/"Warning" lines.

    Side effects:
        Always discards the previous corpus, sources_list and embeddings. The
        new data is published only after every URL has been processed and the
        embeddings were computed, so corpus and embeddings never disagree.

    Limits: 100 paragraphs per URL to manage memory.
    """
    global corpus, embeddings, sources_list
    # Clear previous data
    corpus.clear()
    sources_list.clear()
    embeddings = None

    # Parse URLs from input (one per line, max 3); dict.fromkeys drops
    # duplicates while keeping the order the user typed them in.
    candidates = (line.strip() for line in (urls or "").splitlines())
    url_list = list(dict.fromkeys(u for u in candidates if u))[:3]
    if not url_list:
        return "Error: Please enter at least one valid URL."

    # Headers to mimic browser and avoid blocking
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

    new_corpus = []
    new_sources = []
    problems = []  # one human-readable line per URL that could not be used

    for url in url_list:
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()  # Raise exception for bad status codes
            paragraphs = _extract_paragraphs(response.text)
        except Exception as e:
            # UI boundary: report the failure and keep going with the other
            # URLs instead of crashing the handler or discarding good content.
            problems.append(f"Error ingesting {url}: {str(e)}. Check URL and try again.")
            continue
        if not paragraphs:
            problems.append(f"Warning: No usable content found at {url}.")
            continue
        new_corpus.extend(paragraphs)
        new_sources.extend([url] * len(paragraphs))

    if not new_corpus:
        # Every URL either failed or was empty, so problems has an entry for each.
        return "\n".join(problems)

    new_embeddings = retriever.encode(new_corpus, convert_to_tensor=True, show_progress_bar=False)

    # Publish in place so any other reference to the lists sees the new data.
    corpus.extend(new_corpus)
    sources_list.extend(new_sources)
    embeddings = new_embeddings

    status = f"Success: Ingested {len(corpus)} paragraphs from {len(set(new_sources))} URLs."
    if problems:
        status += "\n" + "\n".join(problems)
    return status

def answer_question(question):
    """
    Answer a question from the ingested pages using retrieval + DistilBERT QA.

    The question is embedded, the 3 most similar paragraphs (fewer if the
    corpus is smaller) are joined into one context, and the QA model extracts
    an answer span from it. Long contexts are not simply cut off: the
    transformers question-answering pipeline splits them into overlapping
    chunks and returns the best-scoring span.

    Args:
        question: Free-text question from the UI; None or blank is rejected.

    Returns:
        A formatted string with the answer, its confidence (2 decimals) and the
        distinct source URLs in retrieval-rank order, or an "Error: ..." string
        when nothing has been ingested or the question is empty.
    """
    if not corpus or embeddings is None:
        return "Error: Please ingest URLs first."

    question = (question or "").strip()
    if not question:
        return "Error: Please enter a question."

    # Encode question into embedding
    question_embedding = retriever.encode(question, convert_to_tensor=True)

    # Compute cosine similarity with stored embeddings
    cos_scores = util.cos_sim(question_embedding, embeddings)[0]
    top_k = min(3, len(corpus))  # Get top 3 or less if fewer paragraphs
    # Tensor.topk works on whatever device the scores live on and yields plain
    # Python ints via tolist(); np.argsort would need a CPU copy and a full sort.
    top_indices = cos_scores.topk(top_k).indices.tolist()

    # Retrieve context (top paragraphs, best match first)
    context = " ".join(corpus[i] for i in top_indices)
    sources = [sources_list[i] for i in top_indices]

    # Extract answer with DistilBERT
    result = qa_model(question=question, context=context)
    answer = result['answer']
    confidence = result['score']

    # Unique sources; dict.fromkeys keeps rank order so the output is stable
    sources_str = "\n".join(dict.fromkeys(sources))
    return f"Answer: {answer}\nConfidence: {confidence:.2f}\nSources:\n{sources_str}"

def clear_all():
    """
    Wipe the in-memory index and hand back blank values for the UI.

    Empties corpus and sources_list in place, resets embeddings to None, and
    returns three empty strings (one per output component of the caller).
    """
    global embeddings
    for store in (corpus, sources_list):
        store.clear()
    embeddings = None
    return ("",) * 3

# Gradio UI with minimal, user-friendly design.
# Components are laid out in the order they are created inside the Blocks
# context, so the statement order below is the on-screen order.
with gr.Blocks(title="Web Content Q&A Tool") as demo:
    # Page heading and short usage instructions
    gr.Markdown(
        """
        # Web Content Q&A Tool
        Enter up to 3 URLs (one per line), ingest their content, and ask questions.
        Answers are generated using only the ingested data. Note: Data resets on app restart.
        """
    )
    
    # URL input and ingestion: textbox on the left, the two action buttons
    # stacked in a column next to it, status line underneath the row.
    with gr.Row():
        url_input = gr.Textbox(label="Enter URLs (one per line, max 3)", lines=3, placeholder="https://example.com")
        with gr.Column():
            ingest_btn = gr.Button("Ingest URLs")
            clear_btn = gr.Button("Clear All")
    ingest_output = gr.Textbox(label="Ingestion Status", interactive=False)
    
    # Question input and answer: question box and Ask button share a row,
    # the read-only answer box sits below.
    with gr.Row():
        question_input = gr.Textbox(label="Ask a question", placeholder="What is this about?")
        ask_btn = gr.Button("Ask")
    answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)
    
    # Bind functions to buttons
    # Ingest: URL textbox text -> ingest_urls -> status textbox
    ingest_btn.click(fn=ingest_urls, inputs=url_input, outputs=ingest_output)
    # Ask: question text -> answer_question -> answer textbox
    ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)
    # Clear: clear_all's three return values go to these three components.
    # NOTE: question_input is not in the outputs list, so a typed question
    # stays in its box after "Clear All".
    clear_btn.click(fn=clear_all, inputs=None, outputs=[url_input, ingest_output, answer_output])

# Launch the app (HF Spaces expects 0.0.0.0:7860).
# share=True is intentionally not passed: Gradio does not support share links
# on Hugging Face Spaces (it warns and ignores the flag), and when run locally
# it would publish the app through a public tunnel.
# The __main__ guard keeps an import of this module from starting a server.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)