frkhan commited on
Commit
483c169
·
0 Parent(s):

Initial commit for LLM-Web-Scrapper project.

Browse files
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ GOOGLE_API_KEY="YOUR-GEMINI-API-KEY"
2
+ NVIDIA_API_KEY="YOUR-NVIDIA-API-KEY"
3
+ FIRECRAWL_API_KEY="YOUR-FIRECRAWL-API-KEY"
.gitignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python artifacts
2
+ __pycache__/
3
+ *.pyc
4
+ *.py[cod]
5
+ *.egg
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ venv/
10
+ .env
11
+ resources/
12
+
13
+ # Chroma DB files
14
+ chroma_db/
15
+ *.sqlite3
16
+ index/
17
+ collections/
18
+
19
+ # Environment files
20
+ .env
21
+ *.log
22
+
23
+ # OS-specific
24
+ .DS_Store
25
+ Thumbs.db
26
+
# Docker
*.dockerfile
*.tar
*.pid
*.db
docker-compose.override.yml
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python base image
2
+ FROM python:3.10-slim-bookworm
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Upgrade system packages to patch vulnerabilities
8
+ RUN apt-get update && apt-get upgrade -y && apt-get clean
9
+
10
+ # Copy requirements and install dependencies
11
+ COPY requirements.txt .
12
+ RUN pip install --no-cache-dir -r requirements.txt
13
+
14
+ # Copy app code
15
+ COPY . .
16
+
17
+ # Expose Gradio default port
18
+ EXPOSE 7860
19
+
20
+ # Run the app
21
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 KI-IAN
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: LLM Web Scrapper
3
+ emoji: 🤖
4
+ colorFrom: indigo
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.46.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import firecrawl_client
4
+ import llm_inference_service
5
+
6
def parse_model_provider(selection):
    """Split a dropdown selection of the form "<model_name> (<provider>)".

    Args:
        selection: Dropdown string, e.g. "gemini-2.5-pro (google_genai)".

    Returns:
        A ``(model, provider)`` tuple of stripped strings.

    Raises:
        ValueError: if *selection* does not end with a "(<provider>)" suffix.
    """
    # Split on the LAST " (" so model names that themselves contain " ("
    # still parse correctly. The original split(" (")[1] mis-parsed such
    # names and raised IndexError (instead of the intended ValueError) on
    # inputs like "(only)" that contain parentheses but no " (" separator.
    model, sep, provider_part = selection.rpartition(" (")
    if sep and provider_part.endswith(")"):
        return model.strip(), provider_part[:-1].strip()
    raise ValueError(f"Invalid selection format: {selection}")
13
+
14
def llm_response_wrapper(query, scrape_result, model_provider_selection):
    """Gradio callback: run LLM extraction and format failures for the UI.

    Parses the combined "model (provider)" dropdown value, delegates to the
    inference service, and substitutes a red inline-HTML error message when
    the service returns nothing usable.
    """
    model, provider = parse_model_provider(model_provider_selection)
    answer = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
    # Falsy result, or a string that is only whitespace, counts as "no answer".
    no_answer = not answer or (isinstance(answer, str) and answer.strip() == "")
    if no_answer:
        return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
    return answer
20
+
21
#Gradio UI
# NOTE(review): this block was recovered from a diff view; the nesting of the
# `with` containers below is reconstructed and should be confirmed against the repo.
with gr.Blocks() as gradio_ui:
    # Header banner: app title plus badge links for the technologies used.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
        <h1 style="margin: 0;"> LLM Web Scraper</h1>
        <div style="display: flex; gap: 10px;">
            <a href="https://github.com/langchain-ai/langchain" target="_blank">
                <img src="https://img.shields.io/badge/LangChain-Framework-blue?logo=langchain" alt="LangChain">
            </a>
            <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                <img src="https://img.shields.io/badge/Gemini%20API-Google-blue?logo=google" alt="Gemini API">
            </a>
            <a href="https://build.nvidia.com/models" target="_blank">
                <img src="https://img.shields.io/badge/NVIDIA%20NIM-API-green?logo=nvidia" alt="NVIDIA NIM">
            </a>
            <a href="https://firecrawl.dev/" target="_blank">
                <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
            </a>
            <a href="https://github.com/crawl4ai/crawl4ai" target="_blank">
                <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
            </a>

        </div>
    </div>
    """)

    # Secondary banner: link to the project's source repository.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
        <span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
        <a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
            <img src="https://img.shields.io/badge/GitHub-View%20Repo-blue?logo=github" alt="GitHub Repo">
        </a>
    </div>
    """)


    with gr.Column():
        # Scrape inputs: the page URL to fetch and the extraction query.
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
        # search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
        scrape_btn = gr.Button("Scrape with FireCrawl")

        # Raw markdown from FireCrawl; doubles as the LLM's context input below.
        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)

        label_llm_section = gr.Label("Use LLM to extract information from the scraped content")
        gr.HTML("<hr>")


        with gr.Row():

            # Add a single dropdown for model and provider selection
            # (format "<model> (<provider>)", parsed by parse_model_provider).
            model_provider_dropdown = gr.Dropdown(
                label="Select Model & Provider",
                choices=[
                    "gemini-2.5-flash-lite (google_genai)",
                    "gemini-2.5-pro (google_genai)",
                    "gemini-2.5-flash (google_genai)",
                    "bytedance/seed-oss-36b-instruct (nvidia)",
                    "deepseek-ai/deepseek-v3.1 (nvidia)",
                    "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
                ],
                value="gemini-2.5-flash-lite (google_genai)"
            )


        llm_response_btn = gr.Button("Extracted Info by LLM")


        # LLM response output area and loader
        llm_response = gr.Markdown(
            "\n" * 9, # 9 newlines + 1 line for empty content = 10 lines minimum
            label="LLM Response",
            show_copy_button=True,
            visible=True
        )
        # Removed custom loader; Gradio will show a spinner automatically during processing.


    # Wire the buttons: scraping fills the result textbox; the LLM button feeds
    # query + scraped markdown + model selection through llm_response_wrapper.
    scrape_btn.click(fn=firecrawl_client.scrape_and_get_markdown_with_firecrawl, inputs=url_input, outputs=scrape_result_textbox)

    llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )

# Bind to all interfaces so the app is reachable inside Docker / HF Spaces.
gradio_ui.launch(server_name="0.0.0.0")
config.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Configuration module for loading environment variables and secrets.

- Loads environment variables from a `.env` file located in the project root
  if it exists; otherwise values are expected to already be in the process
  environment (e.g. Hugging Face's Secrets tab).
- Provides access to the following secrets:
    - GOOGLE_API_KEY: API key for Gemini LLM.
    - NVIDIA_API_KEY: API key for NVIDIA LLM.
    - FIRE_CRAWL_API_KEY: API key for FireCrawl (env var FIRECRAWL_API_KEY).
- Prints warnings if required API keys are not set.
"""
import os

from dotenv import load_dotenv

# Load .env only if running locally.
# FIX: the original path used '..', which points one directory ABOVE the
# project root, so a root-level .env was never found. config.py lives at the
# project root (app.py imports it directly; Dockerfile copies the tree flat),
# so the .env file sits next to this module.
env_path = os.path.join(os.path.dirname(__file__), '.env')

if os.path.exists(env_path):
    load_dotenv(dotenv_path=env_path)


# Access Secrets
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
FIRE_CRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")

if not GOOGLE_API_KEY:
    print("⚠️ Warning: GOOGLE_API_KEY is not set. Gemini LLM API may fail.")

if not NVIDIA_API_KEY:
    print("⚠️ Warning: NVIDIA_API_KEY is not set. NVIDIA LLM API may fail.")

if not FIRE_CRAWL_API_KEY:
    print("⚠️ Warning: FIRECRAWL_API_KEY is not set. FireCrawl API may fail.")
crawl4ai_client.py ADDED
File without changes
docker-compose.dev.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ semantic-search-app:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ container_name: llm-web-scrapper
9
+ ports:
10
+ - "12200:7860"
11
+ environment:
12
+ - NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env in local/dev environment
13
+ - GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env in local/dev environment
14
+ - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
15
+ volumes:
16
+ - .:/app:rw # This is for local development. Docker reads the code from the host machine. Changes on the host are reflected in the container.
17
+ restart: unless-stopped
docker-compose.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ semantic-search-app:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ container_name: llm-web-scrapper
9
+ ports:
10
+ - "12200:7860"
11
+ environment:
12
+ - NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env or manually add the secret
13
+ - GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env or manually add the secret
14
+ - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
15
+ restart: unless-stopped
16
+
firecrawl_client.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import FireCrawlLoader
2
+ from langchain_core.documents import Document
3
+ from config import FIRE_CRAWL_API_KEY
4
+
5
+
6
def scrape_with_firecrawl(url: str) -> list[Document]:
    """Scrape *url* with FireCrawl and return the loaded documents.

    Runs the loader in 'scrape' mode (single page, no crawl) with the API key
    from config, and materializes the lazily-loaded pages into a list.
    """
    loader = FireCrawlLoader(url=url, api_key=FIRE_CRAWL_API_KEY, mode='scrape')
    return list(loader.lazy_load())  # type: ignore
18
+
19
def get_markdown_from_documents(docs: list[Document]) -> str:
    """Render scraped documents as one markdown string.

    Each document becomes a "### Page N" heading (1-based) followed by its
    page content and a horizontal-rule divider. Returns "" for an empty list.
    """
    sections = [
        f"### Page {page_number}\n{doc.page_content}\n\n--------------\n\n"
        for page_number, doc in enumerate(docs, start=1)
    ]
    return "".join(sections)
25
+
26
+
27
def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
    """Convenience wrapper: scrape *url* and return the result as markdown."""
    return get_markdown_from_documents(scrape_with_firecrawl(url))
31
+
32
+
33
+
34
+
llm_inference_service.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chat_models import init_chat_model
2
+
3
+
4
def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model_name: str, model_provider: str) -> str:
    """Ask an LLM to extract the requested information from scraped content.

    Args:
        user_query: What the user wants extracted (e.g. "product name, price").
        scraped_markdown_content: Markdown produced by the scraper; used as
            the prompt context.
        model_name: Model identifier understood by `init_chat_model`.
        model_provider: Provider key (e.g. "google_genai", "nvidia").

    Returns:
        The model's answer, or a fixed fallback message when no usable
        context was supplied.
    """
    # FIX: also treat whitespace-only content as "no context" (the original
    # only checked falsiness), so we never spend an LLM call on an empty page.
    if not scraped_markdown_content or not scraped_markdown_content.strip():
        return "No relevant information found to answer your question."

    context = scraped_markdown_content

    prompt = f"""
    You are an expert assistant who can extract useful information from the content provided to you. Most of the time,
    the content will be product pages from e-commerce websites. Users will ask you to extract product information such as product name, price, rating, etc.

    Please provide your identity (model name and provider if applicable) at the beginning of your answer.

    Use the following context to answer the user's question. Provide the final answer in a markdown table format if you are asked to extract product information.
    If you can't extract anything useful provide in plain markdown format.

    If user asks for JSON format, please provide the answer in JSON format only.

    If you do not find or know the answer, do not hallucinate, do not try to generate fake answers.
    If no Context is given, simply state "No relevant information found to answer your question."

    Context:
    {context}

    Question:
    {user_query}

    Your Identity:

    Answer:

    """

    # init_chat_model resolves the concrete chat-model class from the
    # (model_name, provider) pair; requires the matching API key in the env.
    llm = init_chat_model(model_name, model_provider=model_provider)
    response = llm.invoke(prompt)
    return response.content
40
+
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gradio==5.46.1
2
+ # langchain==0.3.27
3
+ # langchain-community==0.3.29
4
+ # chromadb==1.1.0
5
+ # PyMuPDF==1.26.4
6
+ # langchain-google-genai==2.1.12
7
+ # langchain-nvidia-ai-endpoints==0.3.18
8
+ # dotenv==0.9.9
9
+
10
+
11
+
12
+ gradio
13
+ requests
14
python-dotenv  # provides `from dotenv import load_dotenv`; the bare "dotenv" PyPI name is a different distribution
16
+
17
+ firecrawl-py
18
+ langchain-community
19
+ langchain-google-genai
20
+ langchain-nvidia-ai-endpoints
21
+ # groq