ctaguchi
/

wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000

Automatic Speech Recognition

Inference Endpoints

Model card Files Files and versions Community

wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000 / upload.py

chipewyan

file path change

92cfe12 over 1 year ago

raw history blame contribute delete

No virus

1.56 kB

	from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer
	import argparse

	def upload(model_path, vocab_file, directory):
	    """Re-save a model checkpoint and its CTC tokenizer into one directory.

	    Running this function will create additional necessary files such as
	    `special_tokens_map.json` and make the model ready to be uploaded to
	    the Hugging Face repository.

	    Args:
	        model_path: Checkpoint location handed to
	            ``Wav2Vec2ForCTC.from_pretrained``.
	        vocab_file: Path to the vocabulary file used to build the
	            ``Wav2Vec2CTCTokenizer``. Relative and absolute paths are
	            both accepted.
	        directory: Destination passed to ``save_pretrained`` for both the
	            model and the tokenizer.
	    """
	    print("Reading the model...")
	    model = Wav2Vec2ForCTC.from_pretrained(model_path)
	    print("Model read")

	    print("Reading the tokenizer...")
	    tokenizer_sentence = Wav2Vec2CTCTokenizer(
	        # Use the path exactly as given: forcing a "./" prefix onto it
	        # would turn an absolute path into a broken relative one.
	        vocab_file,
	        unk_token="[UNK]",
	        pad_token="[PAD]",
	        # Plain pipe character. Writing it with a backslash is an invalid
	        # escape sequence and yields a two-character delimiter token.
	        word_delimiter_token="|",
	    )
	    print("Tokenizer read")

	    print("Saving the model...")
	    model.save_pretrained(directory)
	    print("Model saved")

	    print("Saving the tokenizer...")
	    tokenizer_sentence.save_pretrained(directory)
	    print("Tokenizer saved")

	if __name__ == "__main__":
	    # Script entry point: read the three paths from the command line and
	    # forward them straight to upload().
	    cli = argparse.ArgumentParser()
	    cli.add_argument(
	        "-m",
	        "--model_path",
	        type=str,
	        default="./",
	        help="Model (checkpoint) directory path",
	    )
	    cli.add_argument(
	        "-v",
	        "--vocab_file",
	        type=str,
	        required=True,
	        help="Vocab file path",
	    )
	    cli.add_argument(
	        "-d",
	        "--directory",
	        type=str,
	        default="./",
	        help="Destination of the saved model",
	    )
	    opts = cli.parse_args()
	    upload(opts.model_path, opts.vocab_file, opts.directory)