flax-community
/

gpt-neo-1.3B-persian

Text Generation

Inference Endpoints

Model card Files Files and versions Community

gpt-neo-1.3B-persian / src /run_tokenizer.sh

m3hrdadfi's picture

Initialize

69dd1b0 almost 3 years ago

raw history blame contribute delete

No virus

785 Bytes

	#!/bin/bash
	#
	# Runs src/train_tokenizer.py (path relative to the current working
	# directory) with the dataset, vocabulary and special-token settings below.
	#
	# Every setting can be overridden from the caller's environment, e.g.
	#   OUTPUT_DIR=/tmp/tokenizer VOCAB_SIZE=32000 bash src/run_tokenizer.sh
	# When a variable is unset or empty, the default listed here is used.
	#
	# Environment variables (all optional):
	#   OUTPUT_DIR           value passed as --output_dir
	#   DATASET_NAME         value passed as --dataset_name
	#   DATASET_CONFIG_NAME  value passed as --dataset_config_name
	#   VOCAB_SIZE           value passed as --vocab_size
	#   MIN_FREQUENCY        value passed as --min_frequency
	#   SPECIAL_TOKENS       comma-separated list passed as --special_tokens

	# Abort on command failure, on use of an unset variable, and on a failure
	# in any pipeline stage.
	set -euo pipefail

	# Force a UTF-8 locale for the Python process.
	export LC_ALL=C.UTF-8
	export LANG=C.UTF-8

	export OUTPUT_DIR="${OUTPUT_DIR:-/home/saied/code/gpt2-medium-persian}"
	export DATASET_NAME="${DATASET_NAME:-oscar}"
	export DATASET_CONFIG_NAME="${DATASET_CONFIG_NAME:-unshuffled_deduplicated_fa}"
	export VOCAB_SIZE="${VOCAB_SIZE:-50000}"
	export MIN_FREQUENCY="${MIN_FREQUENCY:-2}"

	# Default special tokens: the named tokens first, followed by the
	# placeholders [U1] .. [U20], all joined by commas.
	# NOTE(review): inside single quotes the backslashes in <\|endoftext\|> and
	# <\|startoftext\|> are passed to Python literally. Confirm that
	# train_tokenizer.py expects them (the conventional spelling is
	# <|endoftext|>).
	default_special_tokens='<s>,<pad>,</s>,<unk>,<mask>,<\|endoftext\|>,<\|startoftext\|>,<sep>,<cls>,<nl>,<tab>,<zwnj>'
	for i in {1..20}; do
	  default_special_tokens+=",[U${i}]"
	done
	export SPECIAL_TOKENS="${SPECIAL_TOKENS:-$default_special_tokens}"

	# Build the argument list as an array so each value stays a single word
	# even if it contains spaces or glob characters.
	train_args=(
	  --output_dir="$OUTPUT_DIR"
	  --dataset_name="$DATASET_NAME"
	  --dataset_config_name="$DATASET_CONFIG_NAME"
	  --vocab_size="$VOCAB_SIZE"
	  --min_frequency="$MIN_FREQUENCY"
	  --special_tokens="$SPECIAL_TOKENS"
	)

	python src/train_tokenizer.py "${train_args[@]}"