Spaces:
Sleeping
Sleeping
| """ | |
| SigLIP 2 Text & Image Encoder -- HuggingFace Space | |
| Encodes text or image queries to 768-dim vectors for the Epstein photo search. | |
| Model: google/siglip2-base-patch16-224 | |
| """ | |
| import gradio as gr | |
| import torch | |
| import torch.nn.functional as F | |
| from PIL import Image | |
| from transformers import AutoModel, AutoTokenizer, AutoProcessor | |
# Hub identifier of the checkpoint shared by the model, tokenizer and processor.
MODEL_NAME = "google/siglip2-base-patch16-224"

# Everything below runs once at import time, so the handlers further down
# can use the module-level `model`, `tokenizer` and `processor` directly.
print(f"Loading {MODEL_NAME}...")

model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()  # switch to evaluation mode; the app only runs inference

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
processor = AutoProcessor.from_pretrained(MODEL_NAME)

print(f"Model loaded. Text hidden size: {model.config.text_config.hidden_size}")
def encode(text: str) -> list:
    """Embed a text query with the model's text tower.

    Args:
        text: The query string. An empty string is accepted and embedded
            like any other input.

    Returns:
        The L2-normalized pooled text embedding of ``text`` as a flat list
        of floats.

    Raises:
        gr.Error: If ``text`` is ``None``.
    """
    # Reject a missing payload up front with a user-visible message, the same
    # way encode_image does, instead of failing inside the tokenizer.
    if text is None:
        raise gr.Error("No text provided")

    # Always pad/truncate to exactly 64 tokens.
    # NOTE(review): presumably this matches how the checkpoint was trained --
    # confirm against the SigLIP 2 model documentation before changing it.
    inputs = tokenizer(
        [text],
        return_tensors="pt",
        padding="max_length",
        max_length=64,
        truncation=True,
    )

    with torch.no_grad():
        feats = model.text_model(**inputs).pooler_output
        feats = F.normalize(feats, dim=-1)

    # The batch holds a single item; return just its vector.
    return feats[0].tolist()
def encode_image(image) -> list:
    """Embed an image with the model's vision tower.

    Args:
        image: Either a ``PIL.Image.Image`` or a string path to an image
            file on disk.

    Returns:
        The L2-normalized image embedding as a flat list of floats.

    Raises:
        gr.Error: If ``image`` is ``None`` or is neither a PIL image nor a
            string path.
    """
    if image is None:
        raise gr.Error("No image provided")

    # Gradio 6.x base64 shortcut returns RGBA — SigLIP needs RGB
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    elif isinstance(image, str):
        # Open inside a context manager so the underlying file is closed
        # deterministically; convert() returns an independent in-memory copy.
        with Image.open(image) as opened:
            image = opened.convert("RGB")
    else:
        raise gr.Error(f"Unexpected image type: {type(image)}")

    inputs = processor(images=[image], return_tensors="pt")

    with torch.no_grad():
        feats = model.get_image_features(pixel_values=inputs["pixel_values"])
        # get_image_features may hand back an output object instead of a bare
        # tensor (presumably depending on the transformers version); in that
        # case use its pooled embedding.
        if not isinstance(feats, torch.Tensor):
            feats = feats.pooler_output
        feats = F.normalize(feats, dim=-1)

    # The batch holds a single item; return just its vector.
    return feats[0].tolist()
# Two-tab UI: one tab per encoder. Each button is also exposed as a named
# API endpoint ("encode" / "encode_image") via `api_name`.
with gr.Blocks(title="SigLIP 2 Encoder") as demo:
    gr.Markdown(
        "# SigLIP 2 Encoder\n"
        "Encodes text or images to 768-dim normalized vectors using google/siglip2-base-patch16-224"
    )

    # --- Text tab: textbox in, embedding JSON out ---
    with gr.Tab("Text"):
        text_input = gr.Textbox(label="Text")
        text_output = gr.JSON(label="Embedding (768-dim)")
        text_btn = gr.Button("Encode Text")
        text_btn.click(
            fn=encode,
            inputs=text_input,
            outputs=text_output,
            api_name="encode",
        )

    # --- Image tab: PIL image in, embedding JSON out ---
    with gr.Tab("Image"):
        image_input = gr.Image(type="pil", label="Image")
        image_output = gr.JSON(label="Embedding (768-dim)")
        image_btn = gr.Button("Encode Image")
        image_btn.click(
            fn=encode_image,
            inputs=image_input,
            outputs=image_output,
            api_name="encode_image",
        )

# Start the app server.
demo.launch()