# Creates a tiny random LayoutLM model with a shrunken vocab and config,
# to be used for basic testing.

import sys
import os

from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast, LayoutLMConfig, LayoutLMForMaskedLM

mname_orig = "microsoft/layoutlm-base-uncased"
mname_tiny = "tiny-layoutlm"
import json
from transformers import AutoTokenizer
from tokenizers import Tokenizer

vocab_keep_items = 5000

tokenizer = AutoTokenizer.from_pretrained(mname_orig, use_fast=True)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
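
# How the vocab is truncated depends on the tokenizer model type: for BPE the
# merges that reference dropped tokens have to be dropped as well, while
# Unigram/WordPiece/WordLevel vocabs can simply be cut at vocab_keep_items.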
if tokenizer_json["model"]["type"] == "BPE":
    new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
    merges = tokenizer_json["model"]["merges"]
    new_merges = []
    for i in range(len(merges)):
        a, b = merges[i].split()
        new_token = "".join((a, b))
        if a in new_vocab and b in new_vocab and new_token in new_vocab:
            new_merges.append(merges[i])
    tokenizer_json["model"]["merges"] = new_merges
elif tokenizer_json["model"]["type"] == "Unigram":
    new_vocab = vocab[:vocab_keep_items]
elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel":
    new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
else:
    raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
tokenizer_json["model"]["vocab"] = new_vocab
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
tokenizer_fast_tiny = tokenizer
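
# Start from the original config and shrink every size-related setting so the
# model ends up with only a couple of tiny layers. vocab_size must match the
# shrunken tokenizer above.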
config_tiny = LayoutLMConfig.from_pretrained(mname_orig)
print(config_tiny)

config_tiny.update(dict(
    vocab_size=vocab_keep_items,
    hidden_size=32,
    intermediate_size=64,
    max_position_embeddings=512,
    max_2d_position_embeddings=128,
    num_attention_heads=2,
    num_hidden_layers=2,
))
print("New config", config_tiny)
model_tiny = LayoutLMForMaskedLM(config_tiny)
print(f"{mname_tiny}: num of params {model_tiny.num_parameters()}")
model_tiny.resize_token_embeddings(len(tokenizer_fast_tiny))
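
# Quick smoke test: run one masked-LM forward pass with the shrunken tokenizer.
# LayoutLM also accepts bbox coordinates; none are passed here, in which case
# the model falls back to all-zero boxes.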
inputs = tokenizer_fast_tiny("The capital of France is [MASK].", return_tensors="pt")
outputs = model_tiny(**inputs)
print("Test with normal tokenizer:", len(outputs.logits[0]))
model_tiny.half()
model_tiny.save_pretrained(".")
tokenizer_fast_tiny.save_pretrained(".")
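
# Write a minimal README for the generated model unless one already exists.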
readme = "README.md"
if not os.path.exists(readme):
    with open(readme, "w") as f:
        f.write(f"This is a {mname_tiny} random model to be used for basic testing.\n")

print(f"Generated {mname_tiny}")