Tharunika1601 committed on
Commit d312027
1 Parent(s): 0710d1c

Update app.py

Files changed (1)
  1. app.py +16 -4
app.py CHANGED
@@ -11,14 +11,26 @@ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
 
 text = st.text_area("Enter a description:")
 if st.button("Generate Image") and text:
-    # Process text and get CLIP features
+    # Process text and get CLIP features for text
     text_features = clip_processor(text, return_tensors="pt", padding=True)
 
-    # Use CLIP's encode_image method to obtain the image features
-    image_representation = clip_model.encode_image(text_features.pixel_values)
+    # Load an example image (replace this with your image loading logic)
+    example_image_path = "path/to/your/image.jpg"
+    example_image = Image.open(example_image_path)
+
+    # Process image and get CLIP features for image
+    image_features = clip_processor(images=example_image, return_tensors="pt", padding=True)
+
+    # Concatenate text and image features
+    combined_features = {
+        "pixel_values": torch.cat([text_features.pixel_values, image_features.pixel_values], dim=-1)
+    }
+
+    # Forward pass through CLIP
+    image_representation = clip_model(**combined_features).last_hidden_state.mean(dim=1)
 
     # For visualization, you can convert the image representation back to an image
-    image_array = image_representation.squeeze().permute(1, 2, 0).cpu().numpy()
+    image_array = image_representation.squeeze().cpu().numpy()
     image = Image.fromarray((image_array * 255).astype('uint8'))
 
     # Display the generated image
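For context on the API touched by this diff: in the transformers library, CLIPProcessor called on text alone returns input_ids and attention_mask (not pixel_values), and CLIPModel exposes get_text_features and get_image_features rather than an encode_image method. Below is a minimal sketch, assuming the same openai/clip-vit-base-patch16 checkpoint, of how CLIP text and image embeddings are typically obtained and compared; the prompt string and image path are placeholders, not part of the committed app.

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")

text = "a photo of a cat"                              # placeholder prompt
example_image = Image.open("path/to/your/image.jpg")   # placeholder path, as in the diff

# Text branch: the processor returns input_ids and attention_mask.
text_inputs = clip_processor(text=text, return_tensors="pt", padding=True)
# Image branch: the processor returns pixel_values.
image_inputs = clip_processor(images=example_image, return_tensors="pt")

with torch.no_grad():
    # Each call yields a (batch_size, projection_dim) embedding.
    text_embeds = clip_model.get_text_features(**text_inputs)
    image_embeds = clip_model.get_image_features(**image_inputs)

# Cosine similarity between the description and the image.
similarity = torch.nn.functional.cosine_similarity(text_embeds, image_embeds)
print(similarity.item())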