Spaces: Running on Zero

Commit eecb045 · 1 parent: 366fd1c
Refactor load_tokenizer function to include error handling and device optimizations; streamline model loading process and improve memory management

app.py CHANGED
@@ -57,7 +57,7 @@ def encode_prompt(text_tokenizer, text_encoder, prompt):
     text_features = text_encoder(input_ids=input_ids, attention_mask=mask)['last_hidden_state'].float()
     lens: List[int] = mask.sum(dim=-1).tolist()
     cu_seqlens_k = F.pad(mask.sum(dim=-1).to(dtype=torch.int32).cumsum_(0), (1, 0))
-    Ltext = max(lens)
+    Ltext = max(lens)
     kv_compact = []
     for len_i, feat_i in zip(lens, text_features.unbind(0)):
         kv_compact.append(feat_i[:len_i])
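For reference, the cu_seqlens_k line in this hunk turns the per-prompt token counts into cumulative offsets for the packed key/value tensor. A minimal sketch, with mask values made up for illustration:

import torch
import torch.nn.functional as F

# Two prompts of length 3 and 5 (assumed values, not from app.py).
mask = torch.tensor([[1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1]])
lens = mask.sum(dim=-1).tolist()  # [3, 5]
cu_seqlens_k = F.pad(mask.sum(dim=-1).to(dtype=torch.int32).cumsum_(0), (1, 0))
print(cu_seqlens_k)  # tensor([0, 3, 8], dtype=torch.int32)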
@@ -77,15 +77,40 @@ def save_slim_model(infinity_model_path, save_file=None, device='cpu', key='gpt_
     print('[Save slim model] done')
     return save_file
 
-def load_tokenizer(t5_path=''):
-
-    tokenizer
-
-
-
-
-
-
+def load_tokenizer(t5_path='google/flan-t5-xl'):
+    """
+    Load and configure the T5 tokenizer and encoder with optimizations.
+    """
+    try:
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        bf16_supported = device.type == 'cuda' and torch.cuda.is_bf16_supported()
+        dtype = torch.bfloat16 if bf16_supported else torch.float32
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            t5_path,
+            legacy=True,
+            model_max_length=512,
+            use_fast=True,
+        )
+
+        if device.type == 'cuda':
+            torch.cuda.empty_cache()
+
+        encoder = T5EncoderModel.from_pretrained(
+            t5_path,
+            torch_dtype=dtype,
+        )
+
+        encoder.eval().requires_grad_(False).to(device)
+
+        if device.type == 'cuda' and not bf16_supported:
+            encoder.half()
+
+        return tokenizer, encoder
+
+    except Exception as e:
+        print(f"Error loading tokenizer/encoder: {str(e)}")
+        raise RuntimeError("Failed to initialize text models") from e
 
 def load_infinity(
     rope2d_each_sa_layer,
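A short usage sketch of the refactored loader as it appears above; the prompt string is made up, and the call mirrors what encode_prompt() in app.py does with the returned pair (assuming the AutoTokenizer / T5EncoderModel imports already used by the file):

text_tokenizer, text_encoder = load_tokenizer(t5_path='google/flan-t5-xl')

tokens = text_tokenizer(['a photo of a red fox'], return_tensors='pt',
                        padding=True, truncation=True)
device = next(text_encoder.parameters()).device
with torch.no_grad():
    features = text_encoder(
        input_ids=tokens.input_ids.to(device),
        attention_mask=tokens.attention_mask.to(device),
    )['last_hidden_state'].float()
print(features.shape)  # (batch, seq_len, 2048) for flan-t5-xl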
@@ -154,8 +179,8 @@ def load_infinity(
     state_dict = torch.load(model_path, map_location=device)
     print(infinity_test.load_state_dict(state_dict))
 
-    # Initialize random number generator on the correct device
-    infinity_test.rng = torch.Generator(device=device)
+    # # Initialize random number generator on the correct device
+    # infinity_test.rng = torch.Generator(device=device)
 
     return infinity_test
 
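The commit comments the generator out rather than moving it; on a ZeroGPU Space the CUDA device is only guaranteed inside the @spaces.GPU-decorated call, so attaching a CUDA generator at load time is fragile. A possible follow-up, purely a sketch and not part of this commit, is to build the generator where sampling actually runs:

# Hypothetical helper; the name and call site are assumptions, not from the diff.
def make_rng(device, seed=None):
    g = torch.Generator(device=device)
    if seed is not None:
        g.manual_seed(seed)
    return g

# e.g. inside the @spaces.GPU generation function:
# infinity.rng = make_rng('cuda', seed=42)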
@@ -315,29 +340,7 @@ def load_transformer(vae, args):
     model_path = args.model_path
 
     if args.checkpoint_type == 'torch':
-        if osp.exists(args.cache_dir):
-            local_model_path = osp.join(args.cache_dir, 'tmp', model_path.replace('/', '_'))
-        else:
-            local_model_path = model_path
-
-        if args.enable_model_cache:
-            slim_model_path = model_path.replace('ar-', 'slim-')
-            local_slim_model_path = local_model_path.replace('ar-', 'slim-')
-            os.makedirs(osp.dirname(local_slim_model_path), exist_ok=True)
-            if not osp.exists(local_slim_model_path):
-                if osp.exists(slim_model_path):
-                    shutil.copyfile(slim_model_path, local_slim_model_path)
-                else:
-                    if not osp.exists(local_model_path):
-                        shutil.copyfile(model_path, local_model_path)
-                    save_slim_model(local_model_path, save_file=local_slim_model_path, device=device)
-                    if not osp.exists(slim_model_path):
-                        shutil.copyfile(local_slim_model_path, slim_model_path)
-                    os.remove(local_model_path)
-                    os.remove(model_path)
-            slim_model_path = local_slim_model_path
-        else:
-            slim_model_path = model_path
+        slim_model_path = model_path
         print(f'Loading checkpoint from {slim_model_path}')
     else:
         raise ValueError(f"Unsupported checkpoint_type: {args.checkpoint_type}")
@@ -465,10 +468,13 @@ args = argparse.Namespace(
 )
 
 # Load models
+print(f"VRAM before forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 text_tokenizer, text_encoder = load_tokenizer(t5_path="google/flan-t5-xl")
+print(f"VRAM before forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 vae = load_visual_tokenizer(args)
+print(f"VRAM before forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 infinity = load_transformer(vae, args)
-
+print(f"VRAM before forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
 # Define the image generation function
 @spaces.GPU
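The same "VRAM before forward pass" line is printed after each loading step. If the logging is kept, the repeated f-string could be factored into a small helper with a step-specific label; this is hypothetical and not part of the commit:

# Hypothetical helper; log_vram is an assumed name, not in app.py.
def log_vram(tag):
    if torch.cuda.is_available():
        print(f"VRAM {tag}: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
    else:
        print(f"VRAM {tag}: CUDA not available")

# log_vram('after load_tokenizer')
# log_vram('after load_transformer')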