import argparse

from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,
)


def upload(model_path: str, vocab_file: str, directory: str) -> None:
    """Re-save a Wav2Vec2 CTC checkpoint and its tokenizer into one directory.

    Running this function will create additional necessary files such as
    `special_tokens_map.json` and make it ready for the model to be uploaded
    on the huggingface repository.

    Args:
        model_path: Model (checkpoint) directory path, handed to
            ``Wav2Vec2ForCTC.from_pretrained``.
        vocab_file: Path to the vocabulary file used to build the tokenizer.
            Relative and absolute paths are both accepted.
        directory: Destination directory for the saved model and tokenizer.
    """
    print("Reading the model...")
    model = Wav2Vec2ForCTC.from_pretrained(model_path)
    print("Model read")

    print("Reading the tokenizer...")
    # Pass the path through unchanged: unconditionally prefixing "./" (as was
    # done before) turns an absolute path into a bogus path relative to the
    # current working directory. Relative paths resolve exactly as before.
    tokenizer_sentence = Wav2Vec2CTCTokenizer(
        vocab_file,
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    print("Tokenizer read")

    print("Saving the model...")
    model.save_pretrained(directory)
    print("Model saved")

    print("Saving the tokenizer...")
    tokenizer_sentence.save_pretrained(directory)
    print("Tokenizer saved")


def main() -> None:
    """Parse the command-line arguments and run `upload` with them."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--model_path",
        default="./",
        type=str,
        help="Model (checkpoint) directory path",
    )
    parser.add_argument(
        "-v",
        "--vocab_file",
        required=True,
        type=str,
        help="Vocab file path",
    )
    parser.add_argument(
        "-d",
        "--directory",
        default="./",
        type=str,
        help="Destination of the saved model",
    )
    args = parser.parse_args()

    upload(args.model_path, args.vocab_file, args.directory)


if __name__ == "__main__":
    main()