import threading
import queue
import time
import json
import wikipedia
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
import numpy as np
import logging

# Globals
TOPICS = ["Art", "Science", "Technology", "Movies", "Sports", "Politics"]
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
index = faiss.IndexFlatL2(384)
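# Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings, which is why the
# FAISS index is created with dimension 384. IndexFlatL2 performs exact
# (brute-force) L2-distance search, which is adequate for the small number of
# articles this app indexes.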
".join([f"[{name}]({url})" for name, url in hf_links]) return f"### Other Applications To Explore!\n{otherlinks}" # Wikipedia disambiguation handling def get_wikipedia_page(topic): """Get Wikipedia page with disambiguation handling""" try: return wikipedia.page(topic) except wikipedia.exceptions.DisambiguationError as e: # Try the first option from disambiguation return wikipedia.page(e.options[0]) except wikipedia.exceptions.PageError: # Try searching for alternatives search_results = wikipedia.search(topic, results=1) if search_results: return wikipedia.page(search_results[0]) raise Exception(f"No page found for topic: {topic}") # Agent 1: Scrape articles and push to queue def agent1_scrape_and_publish(selected_topics, count): if not selected_topics: return "Please select at least one topic." titles = [] count = min(count, 5) # Limit to prevent overload for topic in selected_topics: try: # Get the main page for the topic page = get_wikipedia_page(topic) content = page.content[:3000] # Limit content size article_queue.put({ "title": page.title, "content": content }) titles.append(page.title) # Also get related articles if count > 1 if count > 1: try: search_results = wikipedia.search(topic, results=count-1) for result in search_results[:count-1]: try: sub_page = wikipedia.page(result) sub_content = sub_page.content[:3000] article_queue.put({ "title": sub_page.title, "content": sub_content }) titles.append(sub_page.summary) except Exception: continue except Exception: pass except Exception as e: logging.error(f"Error fetching {topic}: {e}") titles.append(f"ERROR: {topic} - {str(e)}") success_count = len([t for t in titles if not t.startswith('ERROR')]) return f"Scraped {success_count} articles:\n" + "\n".join(titles) # Agent 2: Consume from queue and index def agent2_consume_and_index(): while True: try: article = article_queue.get(timeout=1) # Skip if already indexed if any(meta["title"] == article["title"] for meta in metadatas): continue content = article["content"] # Create embedding vector = embedding_model.encode(content) # Add to FAISS index faiss_store.append(vector) metadatas.append({ "title": article["title"], "content": content }) # Add to FAISS index (reshape to 2D array) index.add(np.array([vector])) logging.info(f"Indexed: {article['title']}") except queue.Empty: time.sleep(0.5) continue except Exception as e: logging.error(f"Error indexing article: {e}") continue # QA function def ask_question(question): if not faiss_store: return "Index is empty. Please scrape some articles first by selecting topics and clicking 'Generate from Wikipedia'." if not question or not question.strip(): return "Please enter a question." try: # Create query embedding query_vector = embedding_model.encode(question) # Search FAISS index D, I = index.search(np.array([query_vector]), k=3) if len(I[0]) == 0 or I[0][0] == -1: return "No relevant articles found for your question." # Get relevant content relevant_articles = [] for idx, score in zip(I[0], D[0]): if idx >= 0 and idx < len(metadatas): try: title = metadatas[idx]["title"] content = metadatas[idx]["content"] # Limit response length safely content_preview = content[:500] if len(content) > 500 else content relevant_articles.append(f"**{title}**:\n{content_preview}...") except (KeyError, IndexError) as e: logging.error(f"Error accessing metadata at index {idx}: {e}") continue if not relevant_articles: return "No relevant articles found." 
return f"Found {len(relevant_articles)} relevant articles:\n\n" + "\n\n".join(relevant_articles) except Exception as e: logging.error(f"Error in ask_question: {e}") return f"Error processing your question: {str(e)}" # Status function def get_status(): return f"Indexed Articles: {len(metadatas)}\nQueue Size: {article_queue.qsize()}" # Gradio UI with gr.Blocks(title="AI Wikipedia Copilot") as demo: gr.Markdown("## AI Copilot for Wikipedia") with gr.Row(): with gr.Column(scale=1): topic_list = gr.CheckboxGroup( choices=TOPICS, label="Select Topics", value=["Science"] # Default selection ) topic_count = gr.Slider( minimum=1, maximum=10, step=1, value=2, label="Articles per Topic" ) scrape_btn = gr.Button("Generate from Wikipedia", variant="primary") status_box = gr.Textbox(label="Status", lines=2) output_titles = gr.Textbox(label="Scraped Articles", lines=8) with gr.Column(scale=2): question_box = gr.Textbox( label="Ask a question about the scraped articles", placeholder="What is artificial intelligence?", lines=2 ) submit_btn = gr.Button("Submit Question", variant="primary") answer_box = gr.Textbox(label="Answer", lines=10) gr.Markdown( """ **How it works:** 1. Select topics and click 'Generate from Wikipedia' to scrape articles 2. Wait for indexing to complete (check status) 3. Ask questions about the scraped content **Note:** I am on a self-directed AI journey. This app scrapes Wikipedia, indexes articles in FAISS, and answers questions using embeddings. There's no better way to learn than build it yourself 🚀 """ ) gr.Markdown(get_links()) # Event handlers scrape_btn.click( fn=agent1_scrape_and_publish, inputs=[topic_list, topic_count], outputs=output_titles ) submit_btn.click( fn=ask_question, inputs=question_box, outputs=answer_box ) # Auto-refresh status every 2 seconds status_timer = gr.Timer(2.0) status_timer.tick(fn=get_status, outputs=status_box) if __name__ == "__main__": # Set up logging logging.basicConfig(level=logging.INFO) # Start the background indexing thread indexing_thread = threading.Thread(target=agent2_consume_and_index, daemon=True) indexing_thread.start() # Launch the app demo.launch(server_name="0.0.0.0", server_port=7860)