Spaces:

medmac01
/

stable-diff-multilingual-v0.1

Sleeping

medmac01 committed on Mar 7, 2024

Commit

9b0b32e

verified ·

1 Parent(s): ff09d94

Upload 6 files

Files changed (2) hide show

model.py CHANGED Viewed

@@ -1,8 +1,4 @@
-import os
-from PIL import Image, ImageDraw
-import cv2
-import numpy as np
-from IPython.display import HTML
 from base64 import b64encode
 import torch
@@ -14,8 +10,7 @@ from diffusers.schedulers.scheduling_ddim import DDIMScheduler
 #from transformers import CLIPTextModel, CLIPTokenizer
 from tqdm.auto import tqdm
 from huggingface_hub import notebook_login
-import weights
 device = 'cpu'
@@ -47,9 +42,6 @@ class MultilingualCLIP(transformers.PreTrainedModel):
         return model, [], [], []
-import torch
-import torch.nn as nn
 # Define the adaptation layer, 'checkpoint_9.pth'
 class AdaptationLayer(nn.Module):
   def __init__(self, input_dim, output_dim):
@@ -87,6 +79,20 @@ adapt_model.to(device)
 state_dict = torch.load('weights/checkpoint_9.pth')
 adapt_model.load_state_dict(state_dict)
 # 1. Load the autoencoder model which will be used to decode the latents into image space.
 vae = AutoencoderKL.from_pretrained(
     'CompVis/stable-diffusion-v1-4', subfolder='vae', use_auth_token=True)

+from PIL import Image
 from base64 import b64encode
 import torch
 #from transformers import CLIPTextModel, CLIPTokenizer
 from tqdm.auto import tqdm
 from huggingface_hub import notebook_login
+import torch.nn as nn
 device = 'cpu'
         return model, [], [], []
 # Define the adaptation layer, 'checkpoint_9.pth'
 class AdaptationLayer(nn.Module):
   def __init__(self, input_dim, output_dim):
 state_dict = torch.load('weights/checkpoint_9.pth')
 adapt_model.load_state_dict(state_dict)
+from Multilingual_CLIP.multilingual_clip import pt_multilingual_clip
+texts = [
+    'قطة تقرأ كتابا'
+]
+model_name = 'M-CLIP/LABSE-Vit-L-14'
+# Load Model & Tokenizer
+text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
+text_model = text_model.to(device)
+text_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+embeddings= text_model.forward(texts, text_tokenizer, device )
 # 1. Load the autoencoder model which will be used to decode the latents into image space.
 vae = AutoencoderKL.from_pretrained(
     'CompVis/stable-diffusion-v1-4', subfolder='vae', use_auth_token=True)

requirements.txt CHANGED Viewed

@@ -2,4 +2,6 @@ transformers
 diffusers
 torch
 accelerate
-gradio

 diffusers
 torch
 accelerate
+gradio
+opencv-python-headless
+tqdm