import asyncio
import os

import gradio as gr

from main import WebScrapingOrchestrator

# Single orchestrator instance shared across all scrape requests.
orchestrator = WebScrapingOrchestrator()


async def scrape_async(url):
    """Scrape *url* via the orchestrator and shape the result for display.

    Args:
        url (str): The URL of the webpage to scrape.

    Returns:
        dict: ``{"Error": <message>}`` when the orchestrator reports an
        error; otherwise a JSON-serializable dict with the scraped URL,
        title, text length, headings, main topics, and a short summary
        (truncated to 800 characters with "..." only when longer).
    """
    result = await orchestrator.process_url(url)

    if "error" in result:
        # BUG FIX: the original wrapped the message in a set literal
        # ({result['error']}), which is not JSON-serializable and would
        # break the gr.JSON output component. Return the plain string.
        return {"Error": result["error"]}

    summary = result["llm_ready_data"]["text_summary"]
    # Only append an ellipsis when the summary was actually truncated.
    short_summary = summary[:800] + "..." if len(summary) > 800 else summary

    return {
        "URL": result.get("url"),
        "Title": result.get("title"),
        "Text Length": result["summary"]["text_length"],
        "Headings": result["llm_ready_data"]["key_headings"],
        "Main Topics": result["llm_ready_data"]["main_topics"],
        "Summary (Short)": short_summary,
    }


def scrape(url):
    """
    Scrape a webpage, extract its content, and return it as structured JSON.

    This synchronous wrapper runs the asynchronous ``scrape_async`` coroutine
    via ``asyncio.run``. The docstring doubles as the MCP tool description
    when the app is launched with ``mcp_server=True``.

    Args:
        url (str): The URL of the webpage to scrape
            (e.g., 'https://example.com').

    Returns:
        dict: A JSON-compatible dictionary with the following keys:
            - URL (str): The scraped webpage URL.
            - Title (str): The title of the webpage.
            - Text Length (int): The length of the extracted text.
            - Headings (list): Key headings extracted from the webpage.
            - Main Topics (list): Main topics identified in the content.
            - Summary (Short) (str): A summary of the text, truncated to
              800 characters with '...' appended when it exceeds that length.
        On failure, a dict with a single "Error" key holding the message.

    Notes:
        - Fetching, rendering, content extraction, and any persistence
          (e.g., database storage) are delegated to
          ``WebScrapingOrchestrator.process_url``; see that class for the
          exact pipeline. (NOTE(review): the original docstring claimed
          Playwright rendering and MongoDB storage — confirm against the
          orchestrator implementation in ``main.py``.)

    Example:
        >>> result = scrape("https://example.com")  # doctest: +SKIP
        >>> print(result)  # doctest: +SKIP
        {
            "URL": "https://example.com",
            "Title": "Example Page",
            "Text Length": 1234,
            "Headings": ["Heading 1", "Heading 2"],
            "Main Topics": ["Topic 1", "Topic 2"],
            "Summary (Short)": "This is a summary of the webpage content..."
        }
    """
    return asyncio.run(scrape_async(url))


# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks(title="MCP Web Scraper") as demo:
    gr.Markdown("### 🔍 MCP LLM Web Scraper")
    url_input = gr.Textbox(label="Enter a webpage URL", placeholder="https://...")
    output = gr.JSON(label="Scraped & LLM-ready Content")
    scrape_button = gr.Button("Scrape Page")
    scrape_button.click(scrape, inputs=url_input, outputs=output)


if __name__ == "__main__":
    # Alternate launch configurations kept for local debugging:
    # os.environ['no_proxy'] = 'localhost, 127.0.0.1, ::1'
    # demo.launch(server_name="0.0.0.0", server_port=7860)
    demo.launch(mcp_server=True)