frkhan commited on
Commit
483c169
·
0 Parent(s):

Initial commit for LLM-Web-Scrapper project.

Browse files
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ GOOGLE_API_KEY="YOUR-GEMINI-API-KEY"
2
+ NVIDIA_API_KEY="YOUR-NVIDIA-API-KEY"
3
+ FIRECRAWL_API_KEY="YOUR-FIRECRAWL-API-KEY"
.gitignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python artifacts
2
+ __pycache__/
3
+ *.pyc
4
+ *.py[cod]
5
+ *.egg
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ venv/
10
+ .env
11
+ resources/
12
+
13
+ # Chroma DB files
14
+ chroma_db/
15
+ *.sqlite3
16
+ index/
17
+ collections/
18
+
19
+ # Environment files
20
+ .env
21
+ *.log
22
+
23
+ # OS-specific
24
+ .DS_Store
25
+ Thumbs.db
26
+
# Docker
*.dockerfile
*.tar
*.pid
*.db
docker-compose.override.yml
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use official Python base image
2
+ FROM python:3.10-slim-bookworm
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Upgrade system packages to patch vulnerabilities
8
+ RUN apt-get update && apt-get upgrade -y && apt-get clean
9
+
10
+ # Copy requirements and install dependencies
11
+ COPY requirements.txt .
12
+ RUN pip install --no-cache-dir -r requirements.txt
13
+
14
+ # Copy app code
15
+ COPY . .
16
+
17
+ # Expose Gradio default port
18
+ EXPOSE 7860
19
+
20
+ # Run the app
21
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 KI-IAN
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: LLM Web Scrapper
3
+ emoji: 🤖
4
+ colorFrom: indigo
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.46.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import firecrawl_client
4
+ import llm_inference_service
5
+
6
def parse_model_provider(selection):
    """Split a dropdown selection of the form "<model_name> (<provider>)".

    Args:
        selection: Dropdown string, e.g. "gemini-2.5-pro (google_genai)".

    Returns:
        A ``(model, provider)`` tuple of stripped strings.

    Raises:
        ValueError: if *selection* does not end with a "(<provider>)" suffix.
    """
    # Split on the LAST " (" so model names that themselves contain " ("
    # still parse correctly. The original split(" (")[1] mis-parsed such
    # names and raised IndexError (instead of the intended ValueError) on
    # inputs like "(only)" that contain parentheses but no " (" separator.
    model, sep, provider_part = selection.rpartition(" (")
    if sep and provider_part.endswith(")"):
        return model.strip(), provider_part[:-1].strip()
    raise ValueError(f"Invalid selection format: {selection}")
13
+
14
def llm_response_wrapper(query, scrape_result, model_provider_selection):
    """Gradio callback: run LLM extraction and format failures for the UI.

    Parses the combined "model (provider)" dropdown value, delegates to the
    inference service, and substitutes a red inline-HTML error message when
    the service returns nothing usable.
    """
    model, provider = parse_model_provider(model_provider_selection)
    answer = llm_inference_service.extract_page_info_by_llm(query, scrape_result, model, provider)
    # Falsy result, or a string that is only whitespace, counts as "no answer".
    no_answer = not answer or (isinstance(answer, str) and answer.strip() == "")
    if no_answer:
        return "❌ <span style='color:red;'>No information could be extracted from the scraped content. Please check your query or try a different model/provider.</span>"
    return answer
20
+
21
#Gradio UI
# NOTE(review): this block was recovered from a diff view; the nesting of the
# `with` containers below is reconstructed and should be confirmed against the repo.
with gr.Blocks() as gradio_ui:
    # Header banner: app title plus badge links for the technologies used.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 20px; flex-wrap: wrap; margin-bottom: 20px;">
        <h1 style="margin: 0;"> LLM Web Scraper</h1>
        <div style="display: flex; gap: 10px;">
            <a href="https://github.com/langchain-ai/langchain" target="_blank">
                <img src="https://img.shields.io/badge/LangChain-Framework-blue?logo=langchain" alt="LangChain">
            </a>
            <a href="https://ai.google.dev/gemini-api/docs" target="_blank">
                <img src="https://img.shields.io/badge/Gemini%20API-Google-blue?logo=google" alt="Gemini API">
            </a>
            <a href="https://build.nvidia.com/models" target="_blank">
                <img src="https://img.shields.io/badge/NVIDIA%20NIM-API-green?logo=nvidia" alt="NVIDIA NIM">
            </a>
            <a href="https://firecrawl.dev/" target="_blank">
                <img src="https://img.shields.io/badge/FireCrawl-Web%20Scraper-orange?logo=fire" alt="FireCrawl">
            </a>
            <a href="https://github.com/crawl4ai/crawl4ai" target="_blank">
                <img src="https://img.shields.io/badge/Crawl4AI-Web%20Scraper-blueviolet?logo=github" alt="Crawl4AI">
            </a>

        </div>
    </div>
    """)

    # Secondary banner: link to the project's source repository.
    gr.HTML("""
    <div style="display: flex; align-items: center; gap: 10px; margin-bottom: 20px;">
        <span style="font-size: 16px;">📦 <strong>Download the full source code:</strong></span>
        <a href="https://github.com/KI-IAN/llm-web-scrapper" target="_blank">
            <img src="https://img.shields.io/badge/GitHub-View%20Repo-blue?logo=github" alt="GitHub Repo">
        </a>
    </div>
    """)


    with gr.Column():
        # Scrape inputs: the page URL to fetch and the extraction query.
        url_input = gr.Textbox(label="Enter URL to scrape", placeholder="https://example.com/query?search=cat+food", lines=1)
        # search_query_input = gr.Textbox(label="Enter your query", placeholder="Paw paw fish adult cat food", lines=1)
        query_input = gr.Textbox(label="What information do you want to find?", placeholder="Find product name, price, rating", lines=1)
        scrape_btn = gr.Button("Scrape with FireCrawl")

        # Raw markdown from FireCrawl; doubles as the LLM's context input below.
        scrape_result_textbox = gr.Textbox(label="Scrape Result", lines=20, show_copy_button=True)

        label_llm_section = gr.Label("Use LLM to extract information from the scraped content")
        gr.HTML("<hr>")


        with gr.Row():

            # Add a single dropdown for model and provider selection
            # (format "<model> (<provider>)", parsed by parse_model_provider).
            model_provider_dropdown = gr.Dropdown(
                label="Select Model & Provider",
                choices=[
                    "gemini-2.5-flash-lite (google_genai)",
                    "gemini-2.5-pro (google_genai)",
                    "gemini-2.5-flash (google_genai)",
                    "bytedance/seed-oss-36b-instruct (nvidia)",
                    "deepseek-ai/deepseek-v3.1 (nvidia)",
                    "qwen/qwen3-next-80b-a3b-instruct (nvidia)",
                ],
                value="gemini-2.5-flash-lite (google_genai)"
            )


        llm_response_btn = gr.Button("Extracted Info by LLM")


        # LLM response output area and loader
        llm_response = gr.Markdown(
            "\n" * 9, # 9 newlines + 1 line for empty content = 10 lines minimum
            label="LLM Response",
            show_copy_button=True,
            visible=True
        )
        # Removed custom loader; Gradio will show a spinner automatically during processing.


    # Wire the buttons: scraping fills the result textbox; the LLM button feeds
    # query + scraped markdown + model selection through llm_response_wrapper.
    scrape_btn.click(fn=firecrawl_client.scrape_and_get_markdown_with_firecrawl, inputs=url_input, outputs=scrape_result_textbox)

    llm_response_btn.click(
        fn=llm_response_wrapper,
        inputs=[query_input, scrape_result_textbox, model_provider_dropdown],
        outputs=llm_response
    )

# Bind to all interfaces so the app is reachable inside Docker / HF Spaces.
gradio_ui.launch(server_name="0.0.0.0")
config.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Configuration module for loading environment variables and secrets.

- Loads environment variables from a `.env` file located in the project root
  if it exists; otherwise values are expected to already be in the process
  environment (e.g. Hugging Face's Secrets tab).
- Provides access to the following secrets:
    - GOOGLE_API_KEY: API key for Gemini LLM.
    - NVIDIA_API_KEY: API key for NVIDIA LLM.
    - FIRE_CRAWL_API_KEY: API key for FireCrawl (env var FIRECRAWL_API_KEY).
- Prints warnings if required API keys are not set.
"""
import os

from dotenv import load_dotenv

# Load .env only if running locally.
# FIX: the original path used '..', which points one directory ABOVE the
# project root, so a root-level .env was never found. config.py lives at the
# project root (app.py imports it directly; Dockerfile copies the tree flat),
# so the .env file sits next to this module.
env_path = os.path.join(os.path.dirname(__file__), '.env')

if os.path.exists(env_path):
    load_dotenv(dotenv_path=env_path)


# Access Secrets
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
FIRE_CRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")

if not GOOGLE_API_KEY:
    print("⚠️ Warning: GOOGLE_API_KEY is not set. Gemini LLM API may fail.")

if not NVIDIA_API_KEY:
    print("⚠️ Warning: NVIDIA_API_KEY is not set. NVIDIA LLM API may fail.")

if not FIRE_CRAWL_API_KEY:
    print("⚠️ Warning: FIRECRAWL_API_KEY is not set. FireCrawl API may fail.")
crawl4ai_client.py ADDED
File without changes
docker-compose.dev.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ semantic-search-app:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ container_name: llm-web-scrapper
9
+ ports:
10
+ - "12200:7860"
11
+ environment:
12
+ - NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env in local/dev environment
13
+ - GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env in local/dev environment
14
+ - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
15
+ volumes:
16
+ - .:/app:rw # This is for local development. Docker reads the code from the host machine. Changes on the host are reflected in the container.
17
+ restart: unless-stopped
docker-compose.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ semantic-search-app:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ container_name: llm-web-scrapper
9
+ ports:
10
+ - "12200:7860"
11
+ environment:
12
+ - NVIDIA_API_KEY=${NVIDIA_API_KEY} # Load this key from .env or manually add the secret
13
+ - GOOGLE_API_KEY=${GOOGLE_API_KEY} # Load this key from .env or manually add the secret
14
+ - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} # Load this key from .env in local/dev environment
15
+ restart: unless-stopped
16
+
firecrawl_client.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import FireCrawlLoader
2
+ from langchain_core.documents import Document
3
+ from config import FIRE_CRAWL_API_KEY
4
+
5
+
6
def scrape_with_firecrawl(url: str) -> list[Document]:
    """Scrape *url* with FireCrawl and return the loaded documents.

    Runs the loader in 'scrape' mode (single page, no crawl) with the API key
    from config, and materializes the lazily-loaded pages into a list.
    """
    loader = FireCrawlLoader(url=url, api_key=FIRE_CRAWL_API_KEY, mode='scrape')
    return list(loader.lazy_load())  # type: ignore
18
+
19
def get_markdown_from_documents(docs: list[Document]) -> str:
    """Render scraped documents as one markdown string.

    Each document becomes a "### Page N" heading (1-based) followed by its
    page content and a horizontal-rule divider. Returns "" for an empty list.
    """
    sections = [
        f"### Page {page_number}\n{doc.page_content}\n\n--------------\n\n"
        for page_number, doc in enumerate(docs, start=1)
    ]
    return "".join(sections)
25
+
26
+
27
def scrape_and_get_markdown_with_firecrawl(url: str) -> str:
    """Convenience wrapper: scrape *url* and return the result as markdown."""
    return get_markdown_from_documents(scrape_with_firecrawl(url))
31
+
32
+
33
+
34
+
llm_inference_service.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chat_models import init_chat_model
2
+
3
+
4
def extract_page_info_by_llm(user_query: str, scraped_markdown_content: str, model_name: str, model_provider: str) -> str:
    """Ask an LLM to extract the requested information from scraped content.

    Args:
        user_query: What the user wants extracted (e.g. "product name, price").
        scraped_markdown_content: Markdown produced by the scraper; used as
            the prompt context.
        model_name: Model identifier understood by `init_chat_model`.
        model_provider: Provider key (e.g. "google_genai", "nvidia").

    Returns:
        The model's answer, or a fixed fallback message when no usable
        context was supplied.
    """
    # FIX: also treat whitespace-only content as "no context" (the original
    # only checked falsiness), so we never spend an LLM call on an empty page.
    if not scraped_markdown_content or not scraped_markdown_content.strip():
        return "No relevant information found to answer your question."

    context = scraped_markdown_content

    prompt = f"""
    You are an expert assistant who can extract useful information from the content provided to you. Most of the time,
    the content will be product pages from e-commerce websites. Users will ask you to extract product information such as product name, price, rating, etc.

    Please provide your identity (model name and provider if applicable) at the beginning of your answer.

    Use the following context to answer the user's question. Provide the final answer in a markdown table format if you are asked to extract product information.
    If you can't extract anything useful provide in plain markdown format.

    If user asks for JSON format, please provide the answer in JSON format only.

    If you do not find or know the answer, do not hallucinate, do not try to generate fake answers.
    If no Context is given, simply state "No relevant information found to answer your question."

    Context:
    {context}

    Question:
    {user_query}

    Your Identity:

    Answer:

    """

    # init_chat_model resolves the concrete chat-model class from the
    # (model_name, provider) pair; requires the matching API key in the env.
    llm = init_chat_model(model_name, model_provider=model_provider)
    response = llm.invoke(prompt)
    return response.content
40
+
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gradio==5.46.1
2
+ # langchain==0.3.27
3
+ # langchain-community==0.3.29
4
+ # chromadb==1.1.0
5
+ # PyMuPDF==1.26.4
6
+ # langchain-google-genai==2.1.12
7
+ # langchain-nvidia-ai-endpoints==0.3.18
8
+ # dotenv==0.9.9
9
+
10
+
11
+
12
+ gradio
13
+ requests
14
python-dotenv  # provides `from dotenv import load_dotenv`; the bare "dotenv" PyPI name is a different distribution
16
+
17
+ firecrawl-py
18
+ langchain-community
19
+ langchain-google-genai
20
+ langchain-nvidia-ai-endpoints
21
+ # groq