#!/usr/bin/env python

import os

from transformers import LayoutLMConfig, LayoutLMForMaskedLM

mname_orig = "microsoft/layoutlm-base-uncased"
mname_tiny = "tiny-layoutlm"

### Tokenizer

import json
from transformers import AutoTokenizer
from tokenizers import Tokenizer

# Shrink the vocab to the first `vocab_keep_items` entries; for BERT-style
# vocabs the special tokens sit at the low ids, so they survive the cut.
vocab_keep_items = 5000
tokenizer = AutoTokenizer.from_pretrained(mname_orig, use_fast=True)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
if tokenizer_json["model"]["type"] == "BPE":
    new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
    # keep only the merges whose two parts and whose product are still in the vocab
    new_merges = []
    for merge in tokenizer_json["model"]["merges"]:
        a, b = merge.split()
        if a in new_vocab and b in new_vocab and "".join((a, b)) in new_vocab:
            new_merges.append(merge)
    tokenizer_json["model"]["merges"] = new_merges
elif tokenizer_json["model"]["type"] == "Unigram":
    # Unigram stores the vocab as a list of [token, score] pairs
    new_vocab = vocab[:vocab_keep_items]
elif tokenizer_json["model"]["type"] in ("WordPiece", "WordLevel"):
    new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
else:
    raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
tokenizer_json["model"]["vocab"] = new_vocab
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
tokenizer_fast_tiny = tokenizer

### Config

config_tiny = LayoutLMConfig.from_pretrained(mname_orig)
print(config_tiny)
# Each model's config is different: inspect the printed config above and shrink
# the size-related entries accordingly.
config_tiny.update(dict(
    vocab_size=vocab_keep_items,
    hidden_size=32,
    intermediate_size=64,
    max_position_embeddings=512,
    max_2d_position_embeddings=128,
    num_attention_heads=2,
    num_hidden_layers=2,
))
print("New config", config_tiny)

### Model

model_tiny = LayoutLMForMaskedLM(config_tiny)
print(f"{mname_tiny}: num of params {model_tiny.num_parameters()}")
model_tiny.resize_token_embeddings(len(tokenizer_fast_tiny))

# Test: run a single forward pass; the print shows the sequence length of the logits
inputs = tokenizer_fast_tiny("The capital of France is [MASK].", return_tensors="pt")
# print(inputs)
outputs = model_tiny(**inputs)
print("Test with normal tokenizer:", len(outputs.logits[0]))

# Save
model_tiny.half()  # fp16 halves the size on disk
model_tiny.save_pretrained(".")
tokenizer_fast_tiny.save_pretrained(".")
# print(model_tiny)

readme = "README.md"
if not os.path.exists(readme):
    with open(readme, "w") as f:
        f.write(f"This is a {mname_tiny} random model to be used for basic testing.\n")

print(f"Generated {mname_tiny}")
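
# --- Round-trip smoke test: a minimal sketch, assuming the tiny model and
# tokenizer were just saved to "." by the code above. `.float()` undoes the
# fp16 cast so the forward pass also works on CPUs lacking half-precision ops.
model_reloaded = LayoutLMForMaskedLM.from_pretrained(".").float()
tokenizer_reloaded = AutoTokenizer.from_pretrained(".")
reload_inputs = tokenizer_reloaded("Paris is the capital of [MASK].", return_tensors="pt")
reload_outputs = model_reloaded(**reload_inputs)
print("Round-trip logits shape:", tuple(reload_outputs.logits.shape))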