diffsketcher / diffsketcher_model.py
jree423's picture
Update: Add full model implementation
1d1055f verified
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simplified DiffSketcher model for text-to-SVG generation.
"""
import base64
import io
import os
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape

import cairosvg
import clip
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
class DiffSketcherModel:
    """Simplified text-to-SVG generator driven by CLIP text features.

    This is a placeholder for the full DiffSketcher pipeline: the prompt is
    encoded with CLIP and the leading components of the normalized text
    embedding are mapped to the position, size and colour of a few circles.
    """

    # File name looked up inside ``model_dir`` for a local CLIP checkpoint.
    CLIP_WEIGHTS_FILE = "ViT-B-32.pt"
    # Registry name understood by ``clip.load`` when the weights must be
    # downloaded. Note the slash: the dashed spelling is only a file name.
    CLIP_MODEL_NAME = "ViT-B/32"

    def __init__(self, model_dir):
        """Load the CLIP model, preferring a local checkpoint in ``model_dir``.

        Args:
            model_dir: Directory that may contain ``ViT-B-32.pt``. When the
                file is absent the model is fetched by ``clip.load``.
        """
        self.model_dir = model_dir
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.clip_model_path = os.path.join(model_dir, self.CLIP_WEIGHTS_FILE)
        # isfile (not exists): clip.load only accepts a regular file as a
        # local checkpoint, so a directory with that name must not be used.
        if os.path.isfile(self.clip_model_path):
            print(f"Loading CLIP model from {self.clip_model_path}")
            clip_source = self.clip_model_path
        else:
            print(f"CLIP model not found at {self.clip_model_path}, downloading...")
            # clip.load resolves downloads by registry name ("ViT-B/32");
            # passing "ViT-B-32" raises RuntimeError("Model ... not found").
            clip_source = self.CLIP_MODEL_NAME
        self.clip_model, _ = clip.load(clip_source, device=self.device)

        # Inference only: switch off training-mode behaviour.
        self.clip_model.eval()
        print(f"DiffSketcher model initialized on device: {self.device}")

    def generate_svg(self, prompt, num_paths=10, width=512, height=512):
        """Generate an SVG document for ``prompt``.

        Args:
            prompt: Text to encode and to print inside the image.
            num_paths: Upper bound on the number of circles drawn; also the
                divisor used to spread the circles vertically.
            width: SVG width attribute.
            height: SVG height attribute.

        Returns:
            The SVG markup as a string.
        """
        print(f"Generating SVG for prompt: {prompt}")

        with torch.no_grad():
            tokens = clip.tokenize([prompt]).to(self.device)
            text_features = self.clip_model.encode_text(tokens)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # One circle per requested path, capped by the embedding width.
        # max(0, ...) keeps a negative num_paths equivalent to "no circles",
        # matching an empty range() rather than slicing from the end.
        circle_count = max(0, min(num_paths, text_features.shape[1]))
        # Single device-to-host transfer instead of one .item() per circle.
        feature_values = text_features[0, :circle_count].tolist()

        return self._render_svg(prompt, feature_values, num_paths, width, height)

    @staticmethod
    def _render_svg(prompt, feature_values, num_paths, width, height):
        """Build the SVG markup from a prompt and plain-float feature values."""
        # The prompt is user-supplied text placed inside an XML element, so
        # '&', '<' and '>' must be escaped to keep the document well-formed.
        safe_prompt = escape(str(prompt))

        parts = [
            f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg">\n',
            '<rect width="100%" height="100%" fill="#f0f0f0"/>\n',
            '<text x="50%" y="10%" font-family="Arial" font-size="20" text-anchor="middle">Generated by DiffSketcher</text>\n',
            f'<text x="50%" y="50%" font-family="Arial" font-size="24" text-anchor="middle" font-weight="bold">{safe_prompt}</text>\n',
        ]

        for index, value in enumerate(feature_values):
            # The (value + 1) terms assume value lies in [-1, 1], which holds
            # for components of an L2-normalized vector.
            x = (value + 1) * width / 2
            # Spread circles over the middle 80% of the height.
            y = ((index / num_paths) * 0.8 + 0.1) * height
            radius = abs(value) * 50 + 10
            hue = (value + 1) * 180
            parts.append(
                f'<circle cx="{x}" cy="{y}" r="{radius}" fill="hsl({hue}, 70%, 60%)" opacity="0.7" />'
            )

        parts.append("</svg>")
        return "".join(parts)

    def svg_to_png(self, svg_content):
        """Rasterize SVG markup to PNG bytes.

        Conversion is best-effort: if ``cairosvg`` fails, the error is
        printed and a red 512x512 PNG carrying the error text is returned
        instead of raising.
        """
        try:
            return cairosvg.svg2png(bytestring=svg_content.encode("utf-8"))
        except Exception as e:  # deliberate catch-all: always return an image
            print(f"Error converting SVG to PNG: {e}")
            return self._error_png(f"Error: {str(e)}")

    @staticmethod
    def _error_png(message):
        """Return PNG bytes of a red 512x512 image with ``message`` drawn on it."""
        from PIL import ImageDraw

        image = Image.new("RGB", (512, 512), color="#ff0000")
        draw = ImageDraw.Draw(image)
        draw.text((256, 256), message, fill="white", anchor="mm")

        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return buffer.getvalue()

    def __call__(self, prompt):
        """Generate an SVG for ``prompt`` and rasterize it.

        Returns:
            A dict with keys ``svg`` (markup string), ``svg_base64``,
            ``png_base64`` (base64 text of the respective payloads) and
            ``image`` (a PIL image opened from the PNG bytes).
        """
        svg_content = self.generate_svg(prompt)
        png_data = self.svg_to_png(svg_content)

        return {
            "svg": svg_content,
            "svg_base64": base64.b64encode(svg_content.encode("utf-8")).decode("utf-8"),
            "png_base64": base64.b64encode(png_data).decode("utf-8"),
            "image": Image.open(io.BytesIO(png_data)),
        }