Edit model card

AlephBertGimmel

Modern Hebrew pretrained BERT model with a 128K token vocabulary.

This is the alephbertgimmel-small-128 checkpoint from the AlephBertGimmel project.

from transformers import AutoTokenizer, AutoModelForMaskedLM


import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Load the pretrained masked-language model and its matching tokenizer.
model = AutoModelForMaskedLM.from_pretrained("imvladikon/alephbertgimmel-small-128")
tokenizer = AutoTokenizer.from_pretrained("imvladikon/alephbertgimmel-small-128")

# Template sentence ("{} is a metropolis that constitutes the center of the
# economy"); "{}" marks the slot the model is asked to fill.
text = "{} היא מטרופולין המהווה את מרכז הכלכלה"

# Put the tokenizer's own mask token into the slot and find its position.
# `input_ids` is used instead of `input` so the builtin is not shadowed.
input_ids = tokenizer.encode(text.format(tokenizer.mask_token), return_tensors="pt")
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    token_logits = model(input_ids).logits
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Print the sentence once per candidate, with the candidate in the slot.
for token in top_5_tokens:
    print(text.format(tokenizer.decode([token])))

# ישראל היא מטרופולין המהווה את מרכז הכלכלה
# ירושלים היא מטרופולין המהווה את מרכז הכלכלה
# חיפה היא מטרופולין המהווה את מרכז הכלכלה
# אילת היא מטרופולין המהווה את מרכז הכלכלה
# אשדוד היא מטרופולין המהווה את מרכז הכלכלה
@torch.inference_mode()
def ppl_naive(text, model, tokenizer):
    """Return a naive perplexity estimate of ``text`` under ``model``.

    The encoded text is handed to the model as both the input and the labels,
    with no token masked, and the exponential of the resulting loss is
    returned. Runs under ``torch.inference_mode()`` because only the scalar
    value is needed, not gradients.

    Args:
        text: The sentence to score.
        model: Callable invoked as ``model(ids, labels=ids)``; element ``0``
            of its result is taken as the loss.
        tokenizer: Object whose ``encode(text, return_tensors="pt")`` yields
            the token ids.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    # Named `token_ids` rather than `input` to avoid shadowing the builtin.
    token_ids = tokenizer.encode(text, return_tensors="pt")
    loss = model(token_ids, labels=token_ids)[0]
    return torch.exp(loss).item()

# Template sentence: "{} is the capital city of the State of Israel, and the
# largest city in Israel by population size".
text = """{} היא עיר הבירה של מדינת ישראל, והעיר הגדולה ביותר בישראל בגודל האוכלוסייה"""

# Score the sentence with each candidate city (Haifa, Jerusalem, Tel Aviv).
for word in ["חיפה", "ירושלים", "תל אביב"]:
    print(ppl_naive(text.format(word), model, tokenizer))

# 9.825098991394043
# 10.594215393066406
# 9.536449432373047

# One would expect "ירושלים" (Jerusalem) to get the smallest value, but here
# it gets the largest.

@torch.inference_mode()
def ppl_pseudo(text, model, tokenizer, ignore_idx=-100):
    """Return a pseudo-perplexity of ``text`` by masking one token at a time.

    A batch is built with one row per interior position of the encoded text.
    In each row exactly one position is replaced by the tokenizer's mask
    token; the first and last positions are never masked (presumably the
    tokenizer's special start/end tokens - confirm for the tokenizer in use).
    Labels keep the original id at mask-token positions and ``ignore_idx``
    everywhere else. The exponential of the model's loss on that batch is
    returned.

    Args:
        text: The sentence to score.
        model: Callable invoked as ``model(ids, labels=labels)``; element
            ``0`` of its result is taken as the loss.
        tokenizer: Provides ``encode(text, return_tensors='pt')`` and
            ``mask_token_id``.
        ignore_idx: Label value written at positions that are not masked.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    token_ids = tokenizer.encode(text, return_tensors='pt')
    seq_len = token_ids.size(-1)
    mask_id = tokenizer.mask_token_id

    # Interior rows of the identity matrix: row i selects position i + 1, so
    # neither the first nor the last position is ever selected.
    one_hot_rows = torch.eye(seq_len)[1:-1].bool()

    # One copy of the sequence per interior position, each with one mask.
    batch = token_ids.repeat(seq_len - 2, 1)
    masked_batch = batch.masked_fill(one_hot_rows, mask_id)

    # Only positions holding the mask token contribute a label.
    labels = batch.masked_fill(masked_batch != mask_id, ignore_idx)

    loss = model(masked_batch, labels=labels)[0]
    return torch.exp(loss).item()


# Score the template sentence with each candidate city (Haifa, Jerusalem,
# Tel Aviv) using the pseudo-perplexity.
for word in ["חיפה", "ירושלים", "תל אביב"]:
    print(ppl_pseudo(text.format(word), model, tokenizer))
# 4.346900939941406
# 3.292382001876831
# 2.732590913772583

When using AlephBertGimmel, please cite:


@misc{guetta2022large,
      title={Large Pre-Trained Models with Extra-Large Vocabularies: A Contrastive Analysis of Hebrew BERT Models and a New One to Outperform Them All}, 
      author={Eylon Guetta and Avi Shmidman and Shaltiel Shmidman and Cheyn Shmuel Shmidman and Joshua Guedalia and Moshe Koppel and Dan Bareket and Amit Seker and Reut Tsarfaty},
      year={2022},
      eprint={2211.15199},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
Downloads last month
2
Safetensors
Model size
78.7M params
Tensor type
F32
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.