A SentencePiece tokenizer trained on the Azerbaijani-only DOLLMA dataset.
Citation
https://huggingface.co/papers/2407.02337
BibTeX:
@inproceedings{isbarov-etal-2024-open,
  title     = {Open foundation models for {Azerbaijani} language},
  author    = {Isbarov, Jafar and
               Huseynova, Kavsar and
               Mammadov, Elvin and
               Hajili, Mammad and
               Ataman, Duygu},
  editor    = {Ataman, Duygu and
               Derin, Mehmet Oguz and
               Ivanova, Sardana and
               K{\"o}ksal, Abdullatif and
               S{\"a}lev{\"a}, Jonne and
               Zeyrek, Deniz},
  booktitle = {Proceedings of the First Workshop on Natural Language Processing for Turkic Languages ({SIGTURK} 2024)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand and Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.sigturk-1.2},
  pages     = {18--28},
}