Custom architectures with HuggingFace 🤗

Community blog post
Published February 4, 2024

Open In Colab

Baseline

In this section we will create a baseline model and train it. In our example, we will train a simple CNN model on the MNIST dataset.

import torch
from torch import nn, optim
import torchvision
from torchvision import datasets, transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Loader settings: mini-batch size and number of data-loading workers
batch_size = 64
num_workers = 2

# MNIST training split stored under ./data (downloaded if missing);
# every image is converted to a tensor when it is read.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

# Shuffled mini-batch iterator over the training split
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)

then define our model

class Net(nn.Module):
    """Small CNN classifier: two conv/pool stages followed by two linear layers.

    The flatten width of 320 equals 20 channels x 4 x 4, which is what a
    1x28x28 input (e.g. MNIST) is reduced to by the two conv/pool stages.
    ``forward`` returns softmax probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        # Classifier head
        self.fc1 = nn.Linear(in_features=320, out_features=50)
        self.fc2 = nn.Linear(in_features=50, out_features=10)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return per-class probabilities for the batch ``x``."""
        # Stage 1: conv -> 2x2 max-pool -> ReLU
        stage1 = F.relu(F.max_pool2d(self.conv1(x), 2))

        # Stage 2: conv -> channel dropout -> 2x2 max-pool -> ReLU
        dropped = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(dropped, 2))

        # Flatten to one 320-wide row per sample
        flat = stage2.view(-1, 320)

        hidden = F.relu(self.fc1(flat))
        # Functional dropout is only active while the module is in training mode
        hidden = F.dropout(hidden, training=self.training)

        scores = self.fc2(hidden)
        return self.softmax(scores)

then train our model and save the weights

model = Net()
# Net.forward already ends in a softmax, i.e. the model emits probabilities.
# nn.CrossEntropyLoss expects raw logits and applies log-softmax internally,
# so using it here would apply softmax twice and flatten the gradients.
# NLLLoss on log-probabilities is the matching loss for this model.
criterion = nn.NLLLoss()
learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
epochs = 10
log_every = 20  # print the averaged loss every `log_every` mini-batches
eps = 1e-8      # lower bound that keeps log() finite if a probability hits 0

for epoch in range(epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()

        outputs = model(inputs)  # class probabilities
        loss = criterion(torch.log(outputs.clamp_min(eps)), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % log_every == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.3f' %
                  (epoch + 1, epochs, i + 1, len(train_dataloader), running_loss / log_every))
            running_loss = 0.0


# Bundle the model's parameters under the 'state_dict' key
checkpoint = dict(state_dict=model.state_dict())

# Serialize the checkpoint to model.pth in the current directory
torch.save(checkpoint, 'model.pth')

custom model

to create a custom architecture that is 🤗 friendly, we need 3 files

  1. MyConfig.py : file defining the configuration
  2. MyModel.py : file defining the model architecture
  3. MyPipe.py : file defining the pipeline

Each of these files has to be defined outside of the main Python interpreter (i.e. saved as a separate .py file rather than defined in the notebook). We do this because our dependencies and custom architecture will then be uploaded automatically.

config

The config file stores information about the architecture and is used to instantiate the model. In my case I chose to store only 2 parameters, which are the parameters for the conv1 and conv2 layers; you can add more parameters of your choosing.

from transformers import PretrainedConfig

class MnistConfig(PretrainedConfig):
    """Configuration holding the two convolution sizes of the MNIST model.

    Args:
        conv1: value stored as ``self.conv1`` (default 10).
        conv2: value stored as ``self.conv2`` (default 20).
        **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    # This is an image classification task, so a model type close to that
    # task is used; according to the author it does not affect the model.
    model_type = "MobileNetV1"

    def __init__(self, conv1=10, conv2=20, **kwargs):
        # Own fields are set first, then the remaining keyword arguments
        # are handed to the base class.
        self.conv1, self.conv2 = conv1, conv2
        super().__init__(**kwargs)
.
├── MyFolder
│   ├── __init__.py
│   └── MyConfig.py
└── model.pth

model

For the model we need to inherit from the PreTrainedModel class and pass the previously defined configuration above into the config_class. Do not forget to instantiate the model using the config parameter.

from transformers import PreTrainedModel
from .MyConfig import MnistConfig # local import
from torch import nn
import torch.nn.functional as F

class MnistModel(PreTrainedModel):
    """CNN digit classifier wrapped as a Hugging Face ``PreTrainedModel``.

    The convolution widths come from ``config.conv1`` and ``config.conv2``.
    The first linear layer is sized from ``config.conv2`` so that any config
    value works; with the default ``conv2=20`` it is the usual 320 features.
    """

    # pass the previously defined config class to the model
    config_class = MnistConfig

    def __init__(self, config):
        """Build the layers from ``config`` (an ``MnistConfig``)."""
        # instantiate the model using the configuration
        super().__init__(config)
        # use the config to instantiate our model
        self.conv1 = nn.Conv2d(1, config.conv1, kernel_size=5)
        self.conv2 = nn.Conv2d(config.conv1, config.conv2, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        # Two (5x5 conv + 2x2 pool) stages shrink a 28x28 image to 4x4, so the
        # flattened width is conv2 * 4 * 4 (= 320 for the default conv2=20).
        self.fc1 = nn.Linear(config.conv2 * 4 * 4, 50)
        self.fc2 = nn.Linear(50, 10)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, labels=None):
        """Return softmax class probabilities for the batch ``x``.

        Args:
            x: input batch; assumed to be (batch, 1, 28, 28) so that the
                flatten width matches ``fc1`` — TODO confirm for other sizes.
            labels: optional targets. The parameter exists so the model can
                be fine-tuned with the Trainer API; for now it only triggers
                a placeholder message.
        """
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Flatten to whatever width fc1 was built with
        x = x.view(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        output = self.softmax(x)
        # Identity check: `!= None` on a tensor goes through tensor comparison
        if labels is not None:
            print("continue training script here")

        return output
.
├── MyFolder
│   ├── __init__.py
│   ├── MyConfig.py
│   └── MyModel.py
└── model.pth

push to the hub 🤗

First we need to log in using a token with write access

from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; the token used must have write access
notebook_login()

then load the model and register it for the auto class

from MyFolder.MyConfig import MnistConfig
from MyFolder.MyModel import MnistModel
import torch

# Default configuration, and a model built from it
conf = MnistConfig()
HF_Model = MnistModel(conf)

# Restore the trained parameters stored under the 'state_dict' key
state_dict = torch.load("model.pth")['state_dict']
HF_Model.load_state_dict(state_dict)

# Register the config and the model for the auto classes
conf.register_for_auto_class()
HF_Model.register_for_auto_class("AutoModelForImageClassification")

finally push our configuration and our model to the hub 🤗

# Push the configuration first, then the model, to the 'MyRepo' repository
conf.push_to_hub('MyRepo')
HF_Model.push_to_hub('MyRepo')

By now your model should be available in your own repo for you to use.

custom pipeline

understanding the workflow

let's call our previously defined model and use it to classify a new image

import requests
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import AutoModelForImageClassification

# load the custom model from the hub (its code lives in the repo,
# hence trust_remote_code=True)
model = AutoModelForImageClassification.from_pretrained(
    "not-lain/MyRepo",
    trust_remote_code=True,
)

# fetch an image from the web
url = "https://cdn.discordapp.com/attachments/753001408663650447/1203351253938216980/8599BQAAAAASUVORK5CYII.png"
response = requests.get(url, stream=True)
response.raise_for_status()  # fail on 4xx / 5xx responses

# stream the body to a local file, 8192 bytes at a time
with open("image.png", "wb") as f:
    f.writelines(response.iter_content(chunk_size=8192))
print("image saved as image.png")

# read the image and convert it to grayscale if needed
img = Image.open("image.png")
gray = img.convert('L')
print(gray.size)  # image dimensions
# >> (1490, 1480)

# preprocessing: to tensor first, then resize to 28x28
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((28, 28), antialias=True),
])
# apply the transform and add a leading batch dimension (batch_size = 1)
tensor = transform(gray).unsqueeze(0)

# inference only, so no gradients are needed
with torch.no_grad():
    out = model(tensor)

# index of the highest-scoring class
label = torch.argmax(out, axis=-1)
print(label.tolist()[0])
# >> 7

creating the pipeline

Let's automate this with our custom pipeline and create something a little bit more complex to cover most use cases:

from transformers import Pipeline
import requests
from PIL import Image
import torchvision.transforms as transforms
import torch

class MnistPipe(Pipeline):
    """Custom pipeline that classifies a digit image with the wrapped model.

    Call-time keyword arguments:
        download: when truthy, the input is treated as a URL and is first
            downloaded to ``image.png`` (routed to ``preprocess``).
        clean_output: when truthy (the default), return the predicted class
            index; otherwise return the raw model output (routed to
            ``postprocess``).
    """

    def __init__(self, **kwargs):
        # self.tokenizer = (...) # code if you want to instantiate more parameters

        Pipeline.__init__(self, **kwargs)  # self.model automatically instantiated here

        # Same preprocessing for every image: to tensor, then resize to 28x28
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((28, 28), antialias=True),
        ])

    def _sanitize_parameters(self, **kwargs):
        """Split call-time kwargs into (preprocess, forward, postprocess) dicts."""
        preprocess_kwargs = {}
        postprocess_kwargs = {}
        if "download" in kwargs:
            preprocess_kwargs["download"] = kwargs["download"]
        if "clean_output" in kwargs:
            postprocess_kwargs["clean_output"] = kwargs["clean_output"]
        # No keyword arguments are routed to _forward
        return preprocess_kwargs, {}, postprocess_kwargs

    def preprocess(self, inputs, download=False):
        """Turn an image path (or a URL when ``download`` is set) into a batch of one tensor."""
        if download:
            # Download the image and name it "image.png"
            self.download_img(inputs)
            inputs = "image.png"

        # Convert inside the context manager so the file handle is released
        with Image.open(inputs) as img:
            gray = img.convert('L')
        tensor = self.transform(gray)
        # Add the leading batch dimension
        return tensor.unsqueeze(0)

    def _forward(self, tensor):
        """Run the model on ``tensor`` without tracking gradients."""
        with torch.no_grad():
            # the model has been automatically instantiated
            # in the __init__ method
            out = self.model(tensor)
        return out

    def postprocess(self, out, clean_output=True):
        """Return the predicted class index, or the raw output if ``clean_output`` is falsy."""
        if not clean_output:
            return out
        label = torch.argmax(out, dim=-1)  # get class
        return label.tolist()[0]

    def download_img(self, url):
        """Download ``url`` and save it as ``image.png`` in the current directory.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        # The context manager releases the streamed connection; the timeout
        # keeps an unresponsive server from blocking forever.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail before writing, so an error page is never saved as an image
            response.raise_for_status()
            with open("image.png", "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        print("image saved as image.png")

let's explain our pipeline :

  • when instantiating the pipeline using pipe = pipeline(...), the parameters will be passed to the __init__ method
  • when calling the previously defined pipeline pipe(...) these parameters will be passed to the _sanitize_parameters method which will split the parameters and pass them to either the :
    • preprocess method : this method is usually used to clean the input; in our case it will load the image, convert it to grayscale, and transform it into a torch tensor
    • _forward method: this method is mostly used to call our model and predict the output
    • postprocess method : this method is typically used to clean our output; in our example, if the clean_output parameter is not True it will return the raw output, else it will apply argmax and extract the label for us.
    • download_img method : this is a custom method I added to our architecture and it is not needed to create a pipeline. In the example above, if the download parameter is true, the preprocess method calls download_img, which downloads the image

when using pipe(...) we call the following methods in order :

  1. _sanitize_parameters : makes sure where each keyword argument goes
  2. preprocess : cleans the input
  3. _forward : uses the AI
  4. postprocess: cleans the output

Do not forget to save your code in an external file as this will automate the process of pushing our code for us

.
├── MyFolder
│   ├── __init__.py
│   ├── MyConfig.py
│   ├── MyModel.py
│   └── MyPipe.py
└── model.pth

push to hub 🤗

Unfortunately, the transformers library does not support the push_to_hub() method for our custom pipeline yet, so let's work around this by cloning our repo locally, saving our dependencies, and pushing everything to the hub.

from huggingface_hub import snapshot_download
from MyFolder.MyPipe import MnistPipe
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline, AutoModelForImageClassification

# clone repo: download the current contents of not-lain/MyRepo into ./MyRepo
snapshot_download(repo_id="not-lain/MyRepo",local_dir="MyRepo")

# register pipeline: bind the task name to our MnistPipe class
PIPELINE_REGISTRY.register_pipeline(
    "image-classification", # or any other custom task
    pipeline_class=MnistPipe,
    pt_model=AutoModelForImageClassification,
    # Optional parameters :
    # select a default revision/branch/commit_hash for the model
    # default={"pt": ("not-lain/MyRepo", "dba8d15072d743b6cb4a707246f801699897fb72")},
    type="image",  # current support type: text, audio, image, multimodal
)
# call the pipeline: instantiate it from the hub repo
pipe = pipeline(
              # Optional : pass the task used above here
              # "image-classification",
              model="not-lain/MyRepo",
              trust_remote_code=True)
# save the pipeline into the local clone
pipe.save_pretrained('MyRepo')

# upload to 🤗: push the whole local folder back to the hub repo
from huggingface_hub import upload_folder
upload_folder(repo_id="not-lain/MyRepo",folder_path="./MyRepo")

All done, now you can use your new pipeline :

from transformers import pipeline
# no need to specify what task we are using
pipe = pipeline(model="not-lain/MyRepo", trust_remote_code=True)
# classify an image given by URL and return the raw model output
pipe( "https://cdn.discordapp.com/attachments/753001408663650447/1203351253938216980/8599BQAAAAASUVORK5CYII.png",
    download=True, # will call the download_img method
    clean_output=False # will be passed as postprocess_kwargs
  )
# >> image saved as image.png
# >> tensor([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])

# classify a local file; clean_output defaults to True, so the label is returned
pipe("image.png")
# >> 7

# the custom helper method can also be called directly
pipe.download_img("https://cdn.discordapp.com/attachments/753001408663650447/1203351253938216980/8599BQAAAAASUVORK5CYII.png")
# >> image saved as image.png
# >> image saved as image.png

Finally add a README.md file to your repo to let people know how to use your custom architecture 🥳

Resources :

Repo custom_code custom_pipeline notes
not-lain/MyRepo small code and easy to follow
vikhyatk/moondream1 big architecture, pipeline can be found here
microsoft/phi-2 🟡 big architecture, working pipeline
Qwen/Qwen-VL-Chat big architecture, no pipeline yet
tiiuae/falcon-7b 🟡 big architecture, working pipeline
briaai/RMBG-1.4 remote code, architecture can be found here

📺 youtube : https://www.youtube.com/watch?v=9gZ7LvEJRBo

🌐 how to reach me : https://not-lain.github.io/