Spaces:

Agents-MCP-Hackathon
/

ai-image-video-mcp-server

Running

App Files Files Community

shohrukhdadakhon committed on Jun 10

Commit

4ad42b5

1 Parent(s): f8dc15a

'first'

Browse files

Files changed (3) hide show

.gitignore +14 -0
app.py +520 -0
requirements.txt +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,14 @@

+# Ignore Python cache
+__pycache__/
+*.pyc
+*.pyo
+# Ignore environment variable file
+.env
+# Ignore local virtual environments
+venv/
+env/
+# Ignore Hugging Face Space build artifacts
+gradio_cached_examples/

app.py ADDED Viewed

	@@ -0,0 +1,520 @@

+import gradio as gr
+from gradio_client import Client, handle_file
+from PIL import Image
+import io, base64, requests, os
+from dotenv import load_dotenv
+from google import genai
+from google.genai import types
+import time
+import mimetypes
+import tempfile
+from io import BytesIO
+load_dotenv()  # pull variables from a local .env file (if any) into the process environment
+MODAL_KEY      = os.getenv("MODAL_LABS_KEY")  # sent as the x-api-key header on Modal requests; None when unset
+MODAL_ENDPOINT = os.getenv("MODAL_LABS_ENDPOINT")  # URL the background-removal and tile-upscale requests are POSTed to
+GOOGLE_API_KEY = os.getenv("GEMINI_API")  # note: the environment variable is named GEMINI_API
+CLARITY_API    = "jbilcke-hf/clarity-upscaler"  # identifier handed to gradio_client.Client in upscale_image
+client = genai.Client(api_key=GOOGLE_API_KEY)  # shared genai client used by the video-generation and image-editing functions
+# ── Function 1: Remove Background ─────────────────────────────── BIREFNET
+def remove_background_image(path: str, output_path: str = None) -> Image.Image:
+    """Edit a local image file using the 'Remove Background' method. Optionally save result.
+    Args:
+        path (str): Absolute or relative path to a PNG/JPEG on disk.
+        output_path (str, optional): Where to save the edited image, e.g. Downloads/bg_removed.png
+    Returns:
+        image: The edited image (PIL.Image) with background removed.
+    """
+    # NOTE(review): the docstring keeps its original Args/Returns shape on purpose; the app is
+    # launched with mcp_server=True, so it presumably doubles as the MCP tool description.
+    #
+    # Raises:
+    #   gr.Error     - the input path is empty/missing, or MODAL_LABS_ENDPOINT is not configured.
+    #   RuntimeError - the endpoint answered with a non-200 status or without "output_base64".
+    if not path or not os.path.exists(path):
+        raise gr.Error("Valid input image path is required.")
+    if not MODAL_ENDPOINT:
+        # Fail with a readable message instead of letting requests choke on a None URL.
+        raise gr.Error("MODAL_LABS_ENDPOINT is not configured.")
+    # Manage the *opened* file with the context manager: `Image.open(path).convert(...)` used
+    # as the context expression would only close the converted copy, not the source file.
+    with Image.open(path) as src:
+        buf = io.BytesIO()
+        # The image is re-encoded as an RGB PNG before being sent to the endpoint.
+        src.convert("RGB").save(buf, format="PNG")
+    img_b64 = base64.b64encode(buf.getvalue()).decode()
+    resp = requests.post(
+        MODAL_ENDPOINT,
+        json={"input_base64": img_b64, "model_type": "bg_removal"},
+        headers={"x-api-key": MODAL_KEY},
+        timeout=60
+    )
+    if resp.status_code != 200:
+        raise RuntimeError(f"Modal error: {resp.text}")
+    payload = resp.json()
+    encoded = payload.get("output_base64") if isinstance(payload, dict) else None
+    if not encoded:
+        # A 200 response without the expected field would otherwise surface as a bare KeyError.
+        raise RuntimeError("Modal error: response did not contain 'output_base64'.")
+    result_img = Image.open(io.BytesIO(base64.b64decode(encoded)))
+    if output_path:
+        result_img.save(output_path)
+    return result_img
+# ── Function 2: Clarity Upscaler ────────────────────────────────
+def upscale_image(
+    path: str,
+    output_path: str = None,
+    scale: float = 2,
+    dynamic: float = 6,
+    creativity: float = 0.35,
+    resemblance: float = 0.6,
+    tiling_width: str = "112",
+    tiling_height: str = "144",
+    model: str = "juggernaut_reborn.safetensors [338b85bc4f]",
+    scheduler: str = "DPM++ 3M SDE Karras",
+    steps: int = 18,
+    seed: int = 1337,
+    downscale: bool = False,
+    downscale_resolution: int = 768
+) -> Image.Image:
+    """Edit a local image using the 'Clarity Upscaler' method. Optionally save result. Useful for stylized upscaling with fractal detail control.
+    Args:
+        path (str): Absolute or relative path to a PNG/JPEG on disk.
+        output_path (str, optional): Path to save the edited image, e.g. Downloads/clarity_upscaled.png.
+        scale (float, optional): Upscale factor (default: 2).
+        dynamic (float, optional): Controls responsiveness of upscale. Range: 1–50 (default: 6).
+        creativity (float, optional): Controls creative generation. Range: 0.3–0.9 (default: 0.35).
+        resemblance (float, optional): How much result resembles original image. Range: 0.3–1.6 (default: 0.6).
+        tiling_width (str, optional): Tiling width for fractal detail (lower = more fractality). Options: 16–256 (default: "112").
+        tiling_height (str, optional): Tiling height for fractal detail (lower = more fractality). Options: 16–256 (default: "144").
+        model (str, optional): Base SD model. Options: juggernaut, epicrealism, flat2DAnimerge (default: juggernaut).
+        scheduler (str, optional): Sampling algorithm used. Options include DPM++, Euler, LMS, etc. (default: DPM++ 3M SDE Karras).
+        steps (int, optional): Number of inference steps. Range: 1–100 (default: 18).
+        seed (int, optional): Random seed. Default: 1337.
+        downscale (bool, optional): Whether to apply post-upscale downscaling. Default: False.
+        downscale_resolution (int, optional): Resolution to downscale to (if downscale=True). Default: 768.
+    Returns:
+        image: The edited image (PIL.Image) upscaled via AI model.
+    """
+    # The docstring is left untouched: the app runs with mcp_server=True, so it is
+    # presumably what MCP clients see as this tool's description.
+    if not (path and os.path.exists(path)):
+        raise gr.Error("Valid input image path is required.")
+    # Named so it cannot be confused with the module-level genai `client`.
+    upscaler = Client(CLARITY_API)
+    # The remote "/predict" endpoint is called positionally; keep this order intact.
+    predict_args = (
+        handle_file(path),
+        "",  # prompt
+        "",  # neg prompt
+        scale,
+        dynamic,
+        creativity,
+        resemblance,
+        tiling_width,
+        tiling_height,
+        model,
+        scheduler,
+        steps,
+        seed,
+        downscale,
+        downscale_resolution,
+        "",  # lora
+        "",  # custom model
+    )
+    # predict() hands back a path to the result, which is opened as a PIL image.
+    upscaled = Image.open(upscaler.predict(*predict_args, api_name="/predict"))
+    if output_path:
+        upscaled.save(output_path)
+    return upscaled
+# ── Function 3: Tile ControlNet Upscaler (Preferred) ────────────
+def upscale_image_preferred(
+    path: str,
+    output_path: str = None,
+    resolution: int = 512,
+    steps: int = 18,
+    strength: float = 0.4,
+    hdr: float = 0.1,
+    guidance: float = 3
+) -> Image.Image:
+    """Edit a local image file using the 'Tile Upscaler' method. This is the preferred upscale method. Optionally save the result.
+    Args:
+        path (str): Absolute or relative path to a PNG/JPEG on disk.
+        output_path (str, optional): Where to save the edited image, e.g., Downloads/upscaled_tile.png.
+        resolution (int, optional): Tile conditioning resolution before inference. Valid range: 256–2048. Default is 512.
+                                    This affects detail level. Output image is roughly 2x this resolution.
+                                    e.g. if 1024 is set, output is ~2048x2048.
+                                    Claude should decide based on image quality — for low-res input, try 1024.
+        steps (int, optional): Number of inference steps. Range: 1–50. Default is 18.
+        strength (float, optional): Strength of transformation (0–1). Default is 0.4.
+        hdr (float, optional): Intensity of HDR effect (0–1). Default is 0.1.
+        guidance (float, optional): Guidance scale (CFG). Range: 0–20. Default is 3.
+    Returns:
+        image: The upscaled image (PIL.Image) generated using ControlNet + RealESRGAN.
+    """
+    # NOTE(review): the docstring keeps its original Args/Returns shape on purpose; the app is
+    # launched with mcp_server=True, so it presumably doubles as the MCP tool description.
+    #
+    # Raises:
+    #   gr.Error     - the input path is empty/missing, or MODAL_LABS_ENDPOINT is not configured.
+    #   RuntimeError - the endpoint answered with a non-200 status or without "output_base64".
+    if not path or not os.path.exists(path):
+        raise gr.Error("Valid input image path is required.")
+    if not MODAL_ENDPOINT:
+        # Fail with a readable message instead of letting requests choke on a None URL.
+        raise gr.Error("MODAL_LABS_ENDPOINT is not configured.")
+    # Manage the *opened* file with the context manager: `Image.open(path).convert(...)` used
+    # as the context expression would only close the converted copy, not the source file.
+    with Image.open(path) as src:
+        buf = io.BytesIO()
+        # The image is re-encoded as an RGB PNG before being sent to the endpoint.
+        src.convert("RGB").save(buf, format="PNG")
+    img_b64 = base64.b64encode(buf.getvalue()).decode()
+    resp = requests.post(
+        MODAL_ENDPOINT,
+        json={
+            "input_base64": img_b64,
+            "model_type": "tile_upscale",
+            "resolution": resolution,
+            "steps": steps,
+            "strength": strength,
+            "hdr": hdr,
+            "guidance": guidance
+        },
+        headers={"x-api-key": MODAL_KEY},
+        # Longer than the 60 s used for background removal.
+        timeout=300
+    )
+    if resp.status_code != 200:
+        raise RuntimeError(f"Modal error: {resp.text}")
+    payload = resp.json()
+    encoded = payload.get("output_base64") if isinstance(payload, dict) else None
+    if not encoded:
+        # A 200 response without the expected field would otherwise surface as a bare KeyError.
+        raise RuntimeError("Modal error: response did not contain 'output_base64'.")
+    result_img = Image.open(io.BytesIO(base64.b64decode(encoded)))
+    if output_path:
+        result_img.save(output_path)
+    return result_img
+def generate_video_from_image(
+    path: str,
+    prompt: str = "",
+    aspect_ratio: str = "16:9",
+    duration: int = 8,
+    output_path: str = None
+) -> str:
+    """
+    Generate a video from an image and a prompt using the Google Veo-2.0 model.
+    Args:
+        path (str): Path to input image on disk (JPG/PNG). This image will be used both as visual input for the video generation and as context for generating a descriptive prompt using the Veo prompt guide.
+        prompt (str): Prompt text to guide the generation. If generated dynamically, it should include subject, style, action, camera motion, composition, and ambiance where possible.
+        aspect_ratio (str): Desired aspect ratio, e.g., "16:9" or "9:16".
+        duration (int): Duration of the generated video in seconds. Valid range: 5–8.
+        output_path (str): Optional path to save the generated MP4 file locally.
+    Returns:
+        str: Path to the generated video file (temporary file used for Gradio display).
+    """
+    # NOTE(review): the docstring keeps its original Args/Returns shape on purpose; the app is
+    # launched with mcp_server=True, so it presumably doubles as the MCP tool description.
+    #
+    # Raises:
+    #   gr.Error - invalid input path, undeterminable image type, an operation that finished
+    #              without a response, or a response that contains no generated videos.
+    if not path or not os.path.exists(path):
+        raise gr.Error("Input image path is invalid or missing.")
+    with open(path, "rb") as f:
+        image_bytes = f.read()
+    # Determine the MIME type from the file name; fall back to the extension when
+    # mimetypes returns nothing or a non-image type.
+    mime_type = mimetypes.guess_type(path)[0]
+    if not mime_type or not mime_type.startswith('image/'):
+        if path.lower().endswith('.png'):
+            mime_type = 'image/png'
+        elif path.lower().endswith(('.jpg', '.jpeg')):
+            mime_type = 'image/jpeg'
+        else:
+            raise gr.Error(f"Could not determine image type for {path}. Please use JPG or PNG.")
+    # The image is passed as raw bytes together with its MIME type.
+    image_type = types.Image(image_bytes=image_bytes, mime_type=mime_type)
+    operation = client.models.generate_videos(
+        model="veo-2.0-generate-001",
+        prompt=prompt,
+        image=image_type,
+        config=types.GenerateVideosConfig(
+            person_generation="allow_adult",
+            aspect_ratio=aspect_ratio,
+            number_of_videos=1,
+            duration_seconds=duration,
+        )
+    )
+    print("Video generation started. Waiting for completion...")
+    # Generation is a long-running operation: poll every 20 s until it reports done.
+    while not operation.done:
+        time.sleep(20)
+        operation = client.operations.get(operation)
+        print("...")
+    # A finished operation without a response is treated as a failure; the whole
+    # operation object is dumped to the console so the reason can be inspected.
+    if not operation.response:
+        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+        print("!! Video Generation FAILED.                 !!")
+        print("!! The operation finished but had no result.!!")
+        print("!! Printing the full operation object below.  !!")
+        print("!! Look for an 'error' field for the reason.  !!")
+        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+        print(operation)
+        raise gr.Error("Video generation failed. Check the server console for the detailed error from the API.")
+    # A response can exist yet carry no videos (presumably e.g. when every output was
+    # filtered); indexing [0] blindly would raise IndexError/TypeError in that case.
+    generated = getattr(operation.response, "generated_videos", None)
+    if not generated:
+        print(operation)
+        raise gr.Error("Video generation finished but returned no videos. Check the server console for details.")
+    print("Operation successful. Downloading video...")
+    video_data = generated[0].video
+    video_bytes = client.files.download(file=video_data)
+    if output_path:
+        with open(output_path, "wb") as out_file:
+            out_file.write(video_bytes)
+        print(f"Video saved to {output_path}")
+    # Always write a temp copy as well: its path is the return value used for UI display.
+    # delete=False keeps the file on disk after the handle is closed by the with-block.
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+        temp_file.write(video_bytes)
+    return temp_file.name
+def edit_image_with_gemini(
+    path: str,
+    prompt: str,
+    output_path: str = None
+) -> str:
+    """
+    Edits an image using Gemini 2.0 Flash Preview Image Generation by applying a prompt to a reference image.
+    This is typically used to generate a background scene behind a subject (e.g., a person or object with background removed),
+    in preparation for video generation. The prompt should clearly describe the desired environment or context **without altering
+    the subject itself**. For example: "Place this car in the desert of Mars, but do not change the car."
+    Args:
+        path (str): Path to the reference image (JPG/PNG), typically a background-removed subject.
+        prompt (str): Instruction describing the desired background or scene to add. Must explicitly state that the subject should remain unchanged.
+        output_path (str): Optional path to save the resulting image file.
+    Returns:
+        str: Path to the generated image (temporary file used for Gradio display or further processing).
+    """
+    # NOTE(review): the docstring keeps its original Args/Returns shape on purpose; the app is
+    # launched with mcp_server=True, so it presumably doubles as the MCP tool description.
+    #
+    # Raises:
+    #   gr.Error - invalid input path, or a response that carries no image part.
+    if not path or not os.path.exists(path):
+        raise gr.Error("Input image path is invalid or missing.")
+    # Keep the source image open only for the duration of the request so its file
+    # handle is released afterwards.
+    with Image.open(path) as original_image:
+        response = client.models.generate_content(
+            model="gemini-2.0-flash-preview-image-generation",
+            contents=[prompt, original_image],
+            config=types.GenerateContentConfig(
+                response_modalities=["TEXT", "IMAGE"]
+            )
+        )
+    # Walk the response defensively: an empty candidate list or a candidate without
+    # content/parts should end in the "no image" error, not an IndexError/TypeError.
+    candidates = response.candidates or []
+    parts = []
+    if candidates and candidates[0].content is not None:
+        parts = candidates[0].content.parts or []
+    # The first part carrying inline data is taken as the generated image.
+    image_data = None
+    for part in parts:
+        if part.inline_data is not None:
+            image_data = Image.open(BytesIO(part.inline_data.data))
+            break
+    if image_data is None:
+        raise gr.Error("No image was returned by Gemini.")
+    # Save to optional output path
+    if output_path:
+        image_data.save(output_path)
+        print(f"Image saved to {output_path}")
+    # Reserve a temp path for the UI, closing the handle before PIL writes to it so
+    # no file descriptor is leaked; delete=False keeps the file on disk.
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
+        temp_path = temp_file.name
+    image_data.save(temp_path)
+    return temp_path
+# ── UI: Background Removal ──────────────────────────────────────
+remove_bg_ui = gr.Interface(  # tab UI for remove_background_image; inputs map by position onto its parameters
+    fn=remove_background_image,
+    inputs=[
+        gr.Textbox(label="Input Image Path", placeholder=r"C:\path\to\input.png"),  # -> path
+        gr.Textbox(label="Optional Output Save Path", placeholder=r"C:\Users\shokh\Downloads\bg_removed.png"),  # -> output_path
+    ],
+    outputs=gr.Image(type="pil", label="Result"),  # the function returns a PIL image
+    title="Remove Background",
+)
+# ── UI: Clarity Upscaler ────────────────────────────────────────
+upscale_ui = gr.Interface(  # tab UI for upscale_image; inputs map by position onto its parameters
+    fn=upscale_image,
+    inputs=[
+        gr.Textbox(label="Input Image Path", placeholder=r"C:\path\to\input.png"),  # -> path
+        gr.Textbox(label="Optional Output Save Path", placeholder=r"C:\Users\shokh\Downloads\clarity_upscaled.png"),  # -> output_path
+        gr.Slider(1, 4, step=0.1, value=2, label="Scale Factor"),  # -> scale
+        gr.Slider(1, 50, step=1, value=6, label="Dynamic"),  # -> dynamic
+        gr.Slider(0.3, 0.9, step=0.01, value=0.35, label="Creativity"),  # -> creativity
+        gr.Slider(0.3, 1.6, step=0.01, value=0.6, label="Resemblance"),  # -> resemblance
+        gr.Dropdown(choices=[str(i) for i in range(16, 257, 16)], value="112", label="Tiling Width"),  # -> tiling_width ("16".."256" in steps of 16, as strings)
+        gr.Dropdown(choices=[str(i) for i in range(16, 257, 16)], value="144", label="Tiling Height"),  # -> tiling_height
+        gr.Dropdown(  # -> model
+            choices=[
+                "juggernaut_reborn.safetensors [338b85bc4f]",
+                "epicrealism_naturalSinRC1VAE.safetensors [84d76a0328]",
+                "flat2DAnimerge_v45Sharp.safetensors"
+            ],
+            value="juggernaut_reborn.safetensors [338b85bc4f]",
+            label="Model"
+        ),
+        gr.Dropdown(  # -> scheduler
+            choices=[
+                "DPM++ 3M SDE Karras", "DPM++ 2M Karras", "Euler a", "Euler", "LMS", "Heun",
+                "DPM++ SDE", "DPM++ 2S a Karras", "DPM2", "UniPC", "DDIM", "PLMS"
+            ],
+            value="DPM++ 3M SDE Karras",
+            label="Scheduler"
+        ),
+        gr.Slider(1, 100, step=1, value=18, label="Inference Steps"),  # -> steps
+        gr.Number(value=1337, label="Seed"),  # -> seed
+        gr.Checkbox(label="Apply Downscaling", value=False),  # -> downscale
+        gr.Number(value=768, label="Downscaling Resolution (if enabled)")  # -> downscale_resolution
+    ],
+    outputs=gr.Image(type="pil", label="Result"),  # the function returns a PIL image
+    title="Clarity Upscaler"
+)
+# ── UI: Tile Upscaler (Preferred) ───────────────────────────────
+tile_upscale_ui = gr.Interface(  # tab UI for upscale_image_preferred; inputs map by position onto its parameters
+    fn=upscale_image_preferred,
+    inputs=[
+        gr.Textbox(label="Input Image Path", placeholder=r"C:\path\to\input.png"),  # -> path
+        gr.Textbox(label="Optional Output Save Path", placeholder=r"C:\Users\shokh\Downloads\tile_upscaled.png"),  # -> output_path
+        gr.Slider(256, 2048, step=64, value=512, label="Resolution"),  # -> resolution
+        gr.Slider(1, 50, step=1, value=18, label="Inference Steps"),  # -> steps
+        gr.Slider(0, 1, step=0.01, value=0.4, label="Strength (0-1)"),  # -> strength
+        gr.Slider(0, 1, step=0.01, value=0.1, label="HDR Effect (0-1)"),  # -> hdr
+        gr.Slider(0, 20, step=0.1, value=3, label="Guidance Scale (0-20)")  # -> guidance
+    ],
+    outputs=gr.Image(type="pil", label="Result"),  # the function returns a PIL image
+    title="Tile Upscaler (Preferred)"
+)
+generate_video_ui = gr.Interface(  # tab UI for generate_video_from_image; inputs map by position onto its parameters
+    fn=generate_video_from_image,
+    inputs=[
+        gr.Textbox(label="Image Path", placeholder="C:\\Users\\shokh\\Desktop\\img.png"),  # -> path
+        gr.Textbox(label="Prompt", placeholder="A scenic view of mountains at sunset"),  # -> prompt
+        gr.Dropdown(choices=["16:9", "9:16"], value="16:9", label="Aspect Ratio"),  # -> aspect_ratio
+        gr.Slider(minimum=5, maximum=8, step=1, value=8, label="Duration (seconds)"),  # -> duration
+        gr.Textbox(label="Optional Output Save Path", placeholder="C:\\Users\\shokh\\Downloads\\video.mp4"),  # -> output_path
+    ],
+    outputs=gr.Video(label="Generated Video"),  # the function returns the path of a temporary .mp4 file
+    title="Image to Video",
+)
+generate_image_ui = gr.Interface(  # tab UI for edit_image_with_gemini; inputs map by position onto its parameters
+    fn=edit_image_with_gemini,
+    inputs=[
+        gr.Textbox(label="Image Path", placeholder="C:\\Users\\shokh\\Desktop\\no_bg_img.png"),  # -> path
+        gr.Textbox(label="Prompt", placeholder="Place me in a futuristic cityscape at sunset"),  # -> prompt
+        gr.Textbox(label="Optional Output Save Path", placeholder="C:\\Users\\shokh\\Downloads\\edited.png"),  # -> output_path
+    ],
+    outputs=gr.Image(label="Edited Image"),  # the function returns the path of a temporary .png file
+    title="Edit Image with Gemini"
+)
+# Final UI with new tab added
+demo = gr.TabbedInterface(  # one tab per tool; interface_list and tab_names are paired by position
+    interface_list=[
+        remove_bg_ui,
+        tile_upscale_ui,
+        upscale_ui,
+        generate_video_ui,
+        generate_image_ui  # Gemini image-editing tab
+    ],
+    tab_names=[
+        "Remove Background",
+        "Upscale (Tile - Preferred)",
+        "Upscale (Clarity)",
+        "Image-to-Video",
+        "Edit Image with Gemini"  # label for generate_image_ui
+    ]
+)
+explanation_md = gr.Markdown(  # static help text; rendered after the tabbed interface inside the final Blocks layout
+"""
+# 🧠 How This AI Image & Video Editing MCP Server Works
+This toolchain provides AI-powered image and video editing capabilities using multiple models connected via the [Claude MCP (Model Context Protocol)](https://modelcontextprotocol.io/) system. You can control and automate these tools from Claude Desktop.
+---
+### 🔧 Tools Available
+#### 1. **Remove Background**
+- **Model**: BiRefNet v2 (hosted on Modal Labs)
+- **Input**: Image with background
+- **Output**: Transparent PNG
+#### 2. **Upscale**
+- **Tile Upscaler**: Highly accurate enhancer using tiled upscaling (hosted on Modal Labs)
+- **Clarity Upscaler**: General quality enhancer (calls external Gradio Space API)
+#### 3. **Image-to-Video**
+- **Model**: Google Veo 2
+- **Input**: Image + Prompt
+- **Output**: Cinematic video clip (5–8 sec)
+- ⚠️ **Note**: Image must be visually coherent; typically used after background editing
+#### 4. **Edit Image with Gemini**
+- **Model**: Gemini 2.0 Flash Preview Image Generation
+- **Purpose**: Add backgrounds/scenes to background-removed subjects
+- ✅ **Important**: Prompt must specify to **not alter the subject**, only modify the environment.
+---
+### 🧑‍💻 How to Use With Claude Desktop (MCP)
+To use this space as an MCP server:
+1. **Download [Claude Desktop](https://claude.ai)**
+2. In Claude's MCP config, add this server and filesystem:
+```json
+{
+  "mcpServers": {
+    "gradio": {
+      "command": "npx",
+      "args": [
+        "mcp-remote",
+        "http://127.0.0.1:7860/gradio_api/mcp/sse"
+      ]
+    },
+    "filesystem": {
+      "command": "npx",
+      "args": [
+        "-y",
+        "@modelcontextprotocol/server-filesystem",
+        "C:\\Users\\YOUR_USERNAME\\Desktop\\claude-accessible-folder"
+      ]
+    }
+  }
+}
+```
+> 🗂️ Replace `YOUR_USERNAME` with your actual Windows username. Make sure the folder `claude-accessible-folder` exists on your Desktop. Claude will use it to share image/video files with the tools.
+---
+### 📺 Demo Video
+👉 [Watch how it works (Loom)](https://www.loom.com/share/90b7c72f4eda47e1a94ba6859b14d13e?sid=f268bb09-6a8d-4c83-8435-cf8f85085a93)
+---
+### 🧵 Built by: [@shdkhasan](https://x.com/shdkhasan)
+"""
+)
+with gr.Blocks() as full_ui:  # final page layout: the tabbed tools first, then the help text
+    demo.render()
+    explanation_md.render()
+full_ui.launch(mcp_server=True, show_error=True)  # runs at module level (no __main__ guard); mcp_server=True requests Gradio's MCP server mode

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+gradio_client
+Pillow
+requests
+python-dotenv
+google-genai