Spaces:

danube2024
/

text-to-image-depth-map

Running

App Files Files Community

danube2024 committed on Jan 21

Commit

dff845d

verified ·

1 Parent(s): 75b29d0

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -14

app.py CHANGED Viewed

@@ -1,23 +1,62 @@
 import gradio as gr
-from diffusers import StableDiffusionPipeline, StableDiffusionDepth2ImgPipeline
 from PIL import Image
-# Initialize pipelines
-sd_pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
-sd_pipeline.to("cuda")
-depth_pipeline = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-depth2img")
-depth_pipeline.to("cuda")
-def generate_images(prompt):
-    base_image = sd_pipeline(prompt).images[0]
-    depth_image = depth_pipeline(prompt=prompt, image=base_image).images[0]
-    return base_image, depth_image
 iface = gr.Interface(
-    fn=generate_images,
-    inputs=gr.Textbox(label="Prompt"),
-    outputs=[gr.Image(label="Bas-relief"), gr.Image(label="Depth Map")],
-    title="Text to Bas-Relief and Depth Map",
 )
 iface.launch()

 import gradio as gr
+from transformers import DPTFeatureExtractor, DPTForDepthEstimation
+from diffusers import StableDiffusionPipeline
+import torch
+import numpy as np
 from PIL import Image
+import open3d as o3d
+from pathlib import Path
+# Everything below is configured to run on CPU with float32 weights.
+device = "cpu"
+torch_dtype = torch.float32
+# Text-to-image: Stable Diffusion 2.1 base checkpoint, loaded with the dtype chosen above
+text_to_image_pipeline = StableDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1-base",
+    torch_dtype=torch_dtype
+)
+text_to_image_pipeline.to(device)
+# Depth estimation: DPT preprocessor and model, both from the Intel/dpt-large checkpoint.
+# NOTE(review): depth_model is never moved to `device`; presumably fine while device is "cpu" -- confirm if that changes.
+feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
+depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
+def generate_3d_from_text(prompt):
+    """Generate an image from a text prompt and estimate its depth map.
+
+    Args:
+        prompt: Text description passed to the text-to-image pipeline.
+
+    Returns:
+        A ``(generated_image, depth_image_pil)`` tuple: the first image
+        returned by the pipeline, and an 8-bit depth map resized to that
+        image's dimensions and scaled so the largest depth value maps to 255.
+    """
+    # Step 1: Generate Image from Text Prompt
+    generated_image = text_to_image_pipeline(prompt).images[0]
+    # Step 2: Estimate Depth from Generated Image
+    encoding = feature_extractor(generated_image, return_tensors="pt")
+    with torch.no_grad():
+        predicted_depth = depth_model(**encoding).predicted_depth
+    # Resize depth map to original image size; the image's size tuple is
+    # reversed to give interpolate the order it expects.
+    prediction = torch.nn.functional.interpolate(
+        predicted_depth.unsqueeze(1),
+        size=generated_image.size[::-1],
+        mode="bicubic",
+        align_corners=False,
+    ).squeeze()
+    # Convert to NumPy once instead of once per use.
+    depth = prediction.cpu().numpy()
+    max_depth = float(np.max(depth))
+    if max_depth > 0:
+        scaled = depth * 255 / max_depth
+    else:
+        # A non-positive or NaN maximum would make the division meaningless;
+        # fall back to an all-black depth map instead of producing NaN/inf.
+        scaled = np.zeros_like(depth)
+    # Bicubic resampling can undershoot below 0; clip so the uint8 cast
+    # cannot wrap those values around to bright pixels.
+    depth_image = np.clip(scaled, 0, 255).astype("uint8")
+    depth_image_pil = Image.fromarray(depth_image)
+    return generated_image, depth_image_pil
+# Gradio Interface: one text box in, two images out.
+# NOTE(review): the title and description advertise 3D model generation, but the
+# callback wired in below returns two images (generated image and depth map),
+# matching the two gr.Image outputs -- confirm the wording is intended.
+title = "3D Model Generation from Text (CPU-friendly)"
+description = "Generate a 3D model from a text description using a lightweight text-to-image and depth estimation."
 iface = gr.Interface(
+    fn=generate_3d_from_text,
+    inputs=gr.Textbox(label="Enter text description", placeholder="Describe your scene (e.g., 'A Roman soldier in armor')"),
+    outputs=[
+        gr.Image(label="Generated Image"),
+        gr.Image(label="Depth Map")
+    ],
+    title=title,
+    description=description,
 )
 iface.launch()