from __future__ import annotations
import os
os.environ['HF_HOME'] = '/tmp/hf_cache'
os.makedirs(os.environ['HF_HOME'], exist_ok=True) # Ensure the directory exists
import gradio as gr
import subprocess
import re
import tempfile
import json
import csv
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
import markdown_pdf # Added for PDF conversion
def is_github_repo(url_or_id):
"""Check if the input is a GitHub repository URL or ID."""
if "github.com" in url_or_id:
return True
if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
return True
return False
def check_repomix_installed():
"""Check if Repomix is installed."""
try:
result = subprocess.run(["repomix", "--version"],
capture_output=True, text=True, check=False)
return result.returncode == 0
except Exception:
return False
def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
"""Run Repomix on the GitHub repository and return the content."""
progress(0, desc="Starting Repomix processing...")
try:
with tempfile.TemporaryDirectory() as temp_dir:
output_file_name = "repomix-output.md"
output_file_path = os.path.join(temp_dir, output_file_name)
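            # Expand a bare "username/repo" ID into a full GitHub URL.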
if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
repo_url = f"https://github.com/{repo_url_or_id}"
else:
repo_url = repo_url_or_id
progress(0.2, desc=f"Running Repomix on {repo_url}...")
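            # Build the Repomix CLI call: process the remote repo and write compressed Markdown to the temp file.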
cmd = [
"repomix",
"--remote", repo_url,
"--output", output_file_path,
"--style", "markdown",
"--compress"
]
process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8')
progress(0.8, desc="Repomix command executed.")
if process.returncode != 0:
error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
return f"Error running Repomix:\n{error_details}", None
if os.path.exists(output_file_path):
with open(output_file_path, 'r', encoding='utf-8') as f:
content = f.read()
progress(1, desc="Repomix output processed.")
return content, output_file_path
else:
error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
except Exception as e:
progress(1, desc="Error during Repomix processing.")
return f"Error processing GitHub repository: {str(e)}", None
def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)):
"""Fetch HTML, extract links, convert to Markdown."""
progress(0, desc=f"Starting web scrape for {url}...")
visited_urls = set()
all_markdown_content = ""
def recursive_scrape(current_url, current_depth, total_links_estimate=1, link_index=0):
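        """Fetch a page, convert it to Markdown, and recurse into internal links until the depth budget is exhausted."""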
if current_url in visited_urls or current_depth < 0:
return ""
visited_urls.add(current_url)
try:
progress_val = link_index / total_links_estimate if total_links_estimate > 0 else 0
progress(progress_val, desc=f"Scraping: {current_url} (Depth: {depth - current_depth})")
html_content = Scraper.fetch_html(current_url)
except Exception as e:
return f"Error fetching {current_url}: {str(e)}\n"
markdown_content = f"## Extracted from: {current_url}\n\n"
markdown_content += Converter.html_to_markdown(
html=html_content,
base_url=current_url,
parser_features='html.parser',
ignore_links=True
)
page_content = markdown_content + "\n\n"
if current_depth > 0:
try:
links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
valid_links = [
link for link in links
if URLUtils.is_internal(link, current_url) and link not in visited_urls
]
num_links = len(valid_links)
for i, link_url in enumerate(valid_links):
page_content += recursive_scrape(link_url, current_depth - 1, num_links, i)
except Exception as e:
page_content += f"Error extracting links from {current_url}: {str(e)}\n"
return page_content
all_markdown_content = recursive_scrape(url, depth)
progress(1, desc="Web scraping complete.")
with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
tmp_file.write(all_markdown_content)
return all_markdown_content, tmp_file.name
def convert_to_json(markdown_content, source_url_or_id):
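    """Wrap the Markdown content and its source identifier in a JSON document."""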
data = {"source": source_url_or_id, "content": markdown_content}
return json.dumps(data, indent=2)
def convert_to_csv(markdown_content, source_url_or_id):
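    """Write the source identifier and Markdown content to a temporary CSV file and return its path."""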
output = tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8")
writer = csv.writer(output)
writer.writerow(["source", "content"])
writer.writerow([source_url_or_id, markdown_content])
output.close()
return output.name
def save_output_to_file(content, output_format, source_url_or_id):
"""Saves content to a temporary file based on format and returns its path."""
processed_content = content # Default for Markdown and Text
if output_format == "JSON":
suffix = ".json"
processed_content = convert_to_json(content, source_url_or_id)
elif output_format == "CSV":
# convert_to_csv returns a path directly
return convert_to_csv(content, source_url_or_id)
elif output_format == "Text":
suffix = ".txt"
elif output_format == "PDF":
suffix = ".pdf"
# PDF conversion happens differently, creates file directly
pdf_output_path = ""
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
pdf_output_path = tmp_pdf_file.name
            md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
            # markdown_pdf builds the document from sections and writes it out with save().
            md_pdf.add_section(markdown_pdf.Section(content))
            md_pdf.save(pdf_output_path)
return pdf_output_path
except Exception as e:
print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
suffix = ".pdf.md"
# No processed_content change needed, it's already markdown
else: # Default to Markdown
suffix = ".md"
with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
tmp_file.write(processed_content)
return tmp_file.name
def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
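    """Route the input to the web scraper or Repomix, convert the result to the selected format, and return (status message, preview text, downloadable file path)."""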
progress(0, desc="Initializing...")
raw_content = ""
error_message = ""
output_file_path = None
if source_type == "GitHub Repository":
if not check_repomix_installed():
error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally."
return error_message, None, None
raw_content, _ = run_repomix(url_or_id, progress=progress)
if "Error" in raw_content:
error_message = raw_content
raw_content = ""
elif source_type == "Webpage":
raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
if "Error" in raw_content:
error_message = raw_content
raw_content = ""
else:
error_message = "Invalid source type selected."
return error_message, None, None
if error_message:
return error_message, None, None
try:
progress(0.9, desc=f"Converting to {output_format_selection}...")
output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
preview_content = raw_content
if output_format_selection == "JSON":
preview_content = convert_to_json(raw_content, url_or_id)
elif output_format_selection == "CSV" and output_file_path:
            try:
                # Preview at most the first five lines of the generated CSV.
                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
                    csv_preview_lines = [line for _, line in zip(range(5), f_csv)]
                preview_content = "".join(csv_preview_lines)
                if not preview_content:
                    preview_content = "[CSV content is empty]"
            except Exception as e_csv_preview:
                preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
elif output_format_selection == "CSV" and not output_file_path:
preview_content = "[CSV file path not available for preview]"
elif output_format_selection == "PDF":
preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
if "Saving as Markdown instead" in (output_file_path or ""):
preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
progress(1, desc="Processing complete.")
return f"Successfully processed: {url_or_id}", preview_content, output_file_path
except Exception as e:
return f"Error during file conversion/saving: {str(e)}", raw_content, None
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
gr.Markdown("# RAG-Ready Content Scraper")
gr.Markdown(
"Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
)
with gr.Row():
with gr.Column(scale=2):
url_input = gr.Textbox(
label="Enter URL or GitHub Repository ID",
placeholder="e.g., https://example.com OR username/repo"
)
source_type_input = gr.Radio(
choices=["Webpage", "GitHub Repository"],
value="Webpage",
label="Select Source Type"
)
depth_input = gr.Slider(
minimum=0, maximum=3, step=1, value=0,
label="Scraping Depth (for Webpages)",
info="0: Only main page. Ignored for GitHub repos."
)
output_format_input = gr.Dropdown(
choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
value="Markdown",
label="Select Output Format"
)
submit_button = gr.Button("Process Content", variant="primary")
with gr.Column(scale=3):
status_output = gr.Textbox(label="Status", interactive=False)
preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
file_download_output = gr.File(label="Download Processed File", interactive=False)
gr.Examples(
examples=[
["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
["gradio-app/gradio", "GitHub Repository", 0, "Text"],
["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
],
inputs=[url_input, source_type_input, depth_input, output_format_input],
outputs=[status_output, preview_output, file_download_output],
fn=process_input_updated,
cache_examples=False
)
with gr.Accordion("How it Works & More Info", open=False):
gr.Markdown(
"""
**Webpage Scraping:**
1. Enter a full URL (e.g., `https://example.com`).
2. Select "Webpage" as the source type.
3. Set the desired scraping depth.
4. Choose your output format.
**GitHub Repository Processing:**
1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
2. Select "GitHub Repository". (Depth is ignored).
3. Choose your output format. Uses **RepoMix**.
**Output Formats:** Markdown, JSON, CSV, Text, PDF.
            **Note:** PDF generation requires the `markdown-pdf` library.
This app is designed for Docker/HuggingFace Spaces.
[View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
"""
)
submit_button.click(
fn=process_input_updated,
inputs=[url_input, source_type_input, depth_input, output_format_input],
outputs=[status_output, preview_output, file_download_output],
)
if __name__ == "__main__":
iface.launch()