imvladikon's picture
Update README.md (#2)
ba3d1bb
---
language:
  - he
tags:
  - language model
---

# AlephBertGimmel

Modern Hebrew pretrained BERT model with a 128K token vocabulary.

A checkpoint of the alephbertgimmel-base-512 model from the alephbertgimmel repository.

# Load the pretrained Hebrew masked-LM checkpoint and its matching tokenizer.
# NOTE: the original snippet imported AutoTokenizer/AutoModelForMaskedLM twice
# (once on its own line, once together with torch); the duplicate was dropped.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

model = AutoModelForMaskedLM.from_pretrained("imvladikon/alephbertgimmel-base-512")
tokenizer = AutoTokenizer.from_pretrained("imvladikon/alephbertgimmel-base-512")

# Fill-mask demo: mask one slot in a Hebrew template sentence and print the
# model's five most likely completions.
text = "{} 讛讬讗 诪讟专讜驻讜诇讬谉 讛诪讛讜讜讛 讗转 诪专讻讝 讛讻诇讻诇讛"

# `input_ids` — the original used `input`, which shadows the builtin.
input_ids = tokenizer.encode(text.format("[MASK]"), return_tensors="pt")
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

# Inference only: no_grad avoids building an autograd graph.
with torch.no_grad():
    token_logits = model(input_ids).logits
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(text.format(tokenizer.decode([token])))

# 讛注讬专 讛讬讗 诪讟专讜驻讜诇讬谉 讛诪讛讜讜讛 讗转 诪专讻讝 讛讻诇讻诇讛
# 讬专讜砖诇讬诐 讛讬讗 诪讟专讜驻讜诇讬谉 讛诪讛讜讜讛 讗转 诪专讻讝 讛讻诇讻诇讛
# 讞讬驻讛 讛讬讗 诪讟专讜驻讜诇讬谉 讛诪讛讜讜讛 讗转 诪专讻讝 讛讻诇讻诇讛
# 诇讜谞讚讜谉 讛讬讗 诪讟专讜驻讜诇讬谉 讛诪讛讜讜讛 讗转 诪专讻讝 讛讻诇讻诇讛
# 讗讬诇转 讛讬讗 诪讟专讜驻讜诇讬谉 讛诪讛讜讜讛 讗转 诪专讻讝 讛讻诇讻诇讛
def ppl_naive(text, model, tokenizer):
    """Return a naive pseudo-perplexity score for *text*.

    The full, unmasked input is fed to the masked-LM with itself as the
    labels, and the result is exp(loss). This is a rough plausibility
    score (lower = more plausible), not a true autoregressive perplexity.

    Args:
        text: input string to score.
        model: masked-LM callable returning (loss, ...) when ``labels`` is given.
        tokenizer: tokenizer whose ``encode`` returns a ``pt`` tensor of ids.

    Returns:
        float: exp of the model's loss on the text.
    """
    # `input_ids` — the original used `input`, which shadows the builtin.
    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Scoring only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids)[0]
    return torch.exp(loss).item()

# Compare naive pseudo-perplexity scores for candidate city names dropped
# into the same Hebrew template sentence (lower score = more plausible).
text = """{} 讛讬讗 注讬专 讛讘讬专讛 砖诇 诪讚讬谞转 讬砖专讗诇, 讜讛注讬专 讛讙讚讜诇讛 讘讬讜转专 讘讬砖专讗诇 讘讙讜讚诇 讛讗讜讻诇讜住讬讬讛"""

for candidate in ("讞讬驻讛", "讬专讜砖诇讬诐", "转诇 讗讘讬讘"):
    print(ppl_naive(text.format(candidate), model, tokenizer))

# 10.181422233581543
# 9.743313789367676
# 10.171016693115234

If you use AlephBertGimmel, please cite:


@misc{gueta2022large,
      title={Large Pre-Trained Models with Extra-Large Vocabularies: A Contrastive Analysis of Hebrew BERT Models and a New One to Outperform Them All}, 
      author={Eylon Gueta and Avi Shmidman and Shaltiel Shmidman and Cheyn Shmuel Shmidman and Joshua Guedalia and Moshe Koppel and Dan Bareket and Amit Seker and Reut Tsarfaty},
      year={2022},
      eprint={2211.15199},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}