bczhou committed
Commit 5f3b360
1 Parent(s): 3738ba5

Rename linear_mapping.py to clip_gpt2.py

Files changed (1)
  1. linear_mapping.py → clip_gpt2.py +17 -35
linear_mapping.py → clip_gpt2.py RENAMED
@@ -1,41 +1,25 @@
-from config import LinearMappingConfig
+from config import CLIPGPT2Config
 from transformers import (
-    GPT2TokenizerFast, GPT2LMHeadModel, AutoModel,
-    CLIPVisionModel, AutoProcessor, BatchEncoding,
+    GPT2TokenizerFast, GPT2LMHeadModel,
+    CLIPVisionModel, BatchEncoding,
+    CLIPImageProcessor,
     AutoConfig, CLIPVisionConfig
 )
 from transformers.models.gpt2.modeling_gpt2 import GPT2DoubleHeadsModelOutput
 import torch
 import torch.nn as nn
 from typing import List, Optional, Union, Tuple, Dict
-from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
-from torchvision.transforms.functional import InterpolationMode

-
-class Transform(torch.nn.Module):
-    def __init__(self, image_size, mean, std):
-        super().__init__()
-        self.transforms = torch.nn.Sequential(
-            Resize([image_size], interpolation=InterpolationMode.BICUBIC, antialias=True),
-            CenterCrop(image_size),
-            ConvertImageDtype(torch.float32),
-            Normalize(mean, std),
-        )
-
-    def forward(self, x) -> torch.Tensor:
-        """`x` should be an instance of `PIL.Image.Image`"""
-        with torch.no_grad():
-            x = self.transforms(x)
-        return x
+EOS_TOKEN_ID = 50256


-class LinearMappingProcessor:
+class CLIPGPT2Processor:
     """
-    A combination of ImageProcessor and GPT2TokenizerFast
+    A combination of CLIP ImageProcessor and GPT2TokenizerFast
     """

-    def __init__(self, config: LinearMappingConfig):
-        self.image_processor = AutoProcessor.from_pretrained(config.image_model)
+    def __init__(self, config: CLIPGPT2Config):
+        self.image_processor = CLIPImageProcessor.from_pretrained(config.image_model)
         self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
         self.add_image_token = config.add_image_token
         if config.add_image_token:
@@ -103,7 +87,7 @@ class ImagePrefix(nn.Module):
     Converts pixel values to prefix image prompts that are later fed to a LLM
     """

-    def __init__(self, config: LinearMappingConfig):
+    def __init__(self, config: CLIPGPT2Config):
         super().__init__()
         clip_config = CLIPVisionConfig.from_pretrained(config.image_model)

@@ -126,21 +110,16 @@ class ImagePrefix(nn.Module):
         return self.ln(prefix_prompts)


-class LinearMapping(nn.Module):
+class CLIPGPT2(nn.Module):

-    def __init__(self, config: LinearMappingConfig):
+    def __init__(self, config: CLIPGPT2Config):
         super().__init__()
         self.image_prefix = ImagePrefix(config)
         self.language_model = GPT2LMHeadModel(AutoConfig.from_pretrained(config.text_model))
         if config.text_from_pretrained:
             self.language_model = self.language_model.from_pretrained(config.text_model)
-        self.processor = LinearMappingProcessor(config)
-        self.tokenizer = self.processor.tokenizer
-        self.image_processor = self.processor.image_processor
-        self.add_image_token = config.add_image_token
-        if config.add_image_token:
-            self.language_model.resize_token_embeddings(len(self.tokenizer))

+        self.language_model.resize_token_embeddings(config.vocab_size)
         if config.freeze_text_model:
             for module in self.language_model.modules():
                 if not isinstance(module, nn.LayerNorm) or config.freeze_ln:
@@ -179,7 +158,7 @@ class LinearMapping(nn.Module):

         for label in labels:
             for k, token in enumerate(label):
-                if token == self.tokenizer.eos_token_id:
+                if token == EOS_TOKEN_ID:
                     label[k + 1:] = -100
                     break
         return {"hidden_states": inputs_embeddings, "labels": labels.to(dtype=torch.int64)}
@@ -208,6 +187,8 @@ class LinearMapping(nn.Module):
         pixel_values: Optional[torch.Tensor] = None,
         **kwargs
     ):
+        in_training = self.training
+        self.eval()
         if pixel_values is None:
             return self.language_model.generate(
                 input_ids=input_ids,
@@ -249,6 +230,7 @@ class LinearMapping(nn.Module):
         )
         if past_input_ids is not None:
             generated_token_ids = torch.cat([past_input_ids, generated_token_ids], dim=-1)
+        self.train(in_training)
         return generated_token_ids

     def forward(
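
Two of the behavioral changes above are easier to read outside the diff. First, the new module-level constant EOS_TOKEN_ID = 50256 (GPT-2's end-of-text id) replaces the self.tokenizer.eos_token_id lookup in the label-preparation loop. That loop masks every position after the first EOS with -100, the default ignore_index of torch.nn.CrossEntropyLoss, so trailing padding never contributes to the loss. A minimal, self-contained sketch of that behavior; the helper name mask_after_eos is ours, not the repo's:

import torch

EOS_TOKEN_ID = 50256  # GPT-2's end-of-text token id, hard-coded by this commit

def mask_after_eos(labels: torch.Tensor) -> torch.Tensor:
    # In-place: blank out everything after the first EOS in each row with
    # -100, the default ignore_index of torch.nn.CrossEntropyLoss.
    for label in labels:
        for k, token in enumerate(label):
            if token == EOS_TOKEN_ID:
                label[k + 1:] = -100
                break
    return labels

print(mask_after_eos(torch.tensor([[11, 22, 50256, 33, 44]])))
# tensor([[   11,    22, 50256,  -100,  -100]])

Second, generate() now records the module's training flag, switches to eval mode before decoding, and calls self.train(in_training) before returning, so calling generate() mid-training no longer leaves the model stuck in eval mode. As written, the restore is skipped if generation raises; a try/finally wrapper restores the mode on every path. A hypothetical variant (generate_safely is not part of the commit, and it assumes the model exposes a generate() method):

def generate_safely(model, *args, **kwargs):
    in_training = model.training  # remember the current train/eval mode
    model.eval()                  # disable dropout etc. while decoding
    try:
        return model.generate(*args, **kwargs)
    finally:
        model.train(in_training)  # restored even if generate() raises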