import streamlit as st
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import requests
from io import BytesIO

# CLIP is not a generative model: it embeds text and images into a shared space,
# so this app scores how well an image matches a description rather than
# synthesizing a new image from text.
st.title("Text-Image Matching with CLIP")

# Load the pretrained processor and model once and cache them across Streamlit reruns
@st.cache_resource
def load_clip():
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
    model.eval()
    return processor, model

clip_processor, clip_model = load_clip()

text = st.text_area("Enter a description:")
if st.button("Score Image") and text:
    # Load an example image from the web (replace this with your image loading logic)
    example_image_url = "https://example.com/your-image.jpg"
    example_image_response = requests.get(example_image_url, timeout=10)
    example_image_response.raise_for_status()
    example_image = Image.open(BytesIO(example_image_response.content)).convert("RGB")

    # Tokenize the text and preprocess the image in a single call: the processor
    # returns input_ids/attention_mask for the text and pixel_values for the image
    inputs = clip_processor(text=[text], images=example_image, return_tensors="pt", padding=True)

    # Forward pass through CLIP; the output carries L2-normalized text and image embeddings
    with torch.no_grad():
        outputs = clip_model(**inputs)

    # Cosine similarity between the text and image embeddings
    similarity = (outputs.text_embeds @ outputs.image_embeds.T).item()

    # Display the example image together with its similarity to the description
    st.image(example_image, caption=f"CLIP similarity to description: {similarity:.3f}", use_column_width=True)

    # For visualization, reshape the 512-dimensional image embedding of this checkpoint
    # into a 16x32 grid and render it as a grayscale heatmap
    embedding = outputs.image_embeds.squeeze().cpu().numpy()
    embedding = (embedding - embedding.min()) / (embedding.max() - embedding.min() + 1e-8)
    heatmap = Image.fromarray((embedding.reshape(16, 32) * 255).astype("uint8"))
    st.image(heatmap, caption="Image embedding visualized as a grayscale grid", use_column_width=True)