{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "db2971a9", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "29f15da8fd9549188347df46955b078d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
\\n 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n" ] } ], "source": [ "from pyctcdecode import build_ctcdecoder\n", "\n", "# Build the CTC decoder from the keys of sorted_vocab_dict and the \"5gram.arpa\" KenLM file.\n", "decoder = build_ctcdecoder(labels=[*sorted_vocab_dict.keys()], kenlm_model_path=\"5gram.arpa\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "46585ac6", "metadata": {}, "outputs": [], "source": [ "from transformers import Wav2Vec2ProcessorWithLM\n", "\n", "# Combine the existing processor's feature extractor and tokenizer with the `decoder` object.\n", "processor_with_lm = Wav2Vec2ProcessorWithLM(\n", "    decoder=decoder,\n", "    tokenizer=processor.tokenizer,\n", "    feature_extractor=processor.feature_extractor,\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "id": "c17befdc", "metadata": {}, "outputs": [], "source": [ "# Save the LM-backed processor under \"wav2vec2-bn-300m\".\n", "processor_with_lm.save_pretrained(\"wav2vec2-bn-300m\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "f3ec60c4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Adding files tracked by Git LFS: ['language_model/unigrams.txt']. 
This may take a bit of time if the files are large.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7aa6e28e8a9c49b79b09f5d2884383d7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload file language_model/unigrams.txt: 0%| | 3.38k/22.3M [00:00 main\n", "\n" ] }, { "data": { "text/plain": [ "'https://huggingface.co/Tahsin-Mayeesha/wav2vec2-bn-300m/commit/258816acfe8e1e49f41b4edcf9f20f812b4bf00d'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NOTE(review): `repo` is not defined in the visible cells; it must come from an earlier cell.\n", "repo.push_to_hub(commit_message=\"Upload lm-boosted decoder\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }