{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0f0b980b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import transformers\n",
    "from datasets import load_dataset\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "55293a8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "username = \"Plim\"  # change to your username\n",
    "target_lang = \"fr\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3b5a735e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f230feb459c441a9a11e53b867e8914a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading: 0%| | 0.00/2.60k [00:00\" in line:\n",
       " write_file.write(line)\n",
       " write_file.write(line.replace(\"\", \"\"))\n",
       " has_added_eos = True\n",
       " else:\n",
       " write_file.write(line)"
      ]
     },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3fdae043",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoProcessor\n",
    "\n",
    "# Load the processor stored in the current directory; its feature extractor\n",
    "# and tokenizer are reused below when the LM-backed processor is assembled.\n",
    "processor = AutoProcessor.from_pretrained(\"./\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2c3c2b49",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lowercase every token and order the entries by token id, so that the key\n",
    "# order can be used directly as the decoder's label list.\n",
    "vocab_dict = processor.tokenizer.get_vocab()\n",
    "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}\n",
    "\n",
    "# Lowercasing can map two distinct tokens to the same key; the dict would then\n",
    "# silently drop one of them and leave fewer labels than tokenizer ids.\n",
    "assert len(sorted_vocab_dict) == len(vocab_dict), \"lowercasing merged distinct vocabulary tokens\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "103034b6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the LM will be faster if you build a binary file.\n",
      "Reading /workspace/xls-r-1b-cv_8-fr/language_model/5gram_correct.arpa\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "****************************************************************************************************\n"
     ]
    }
   ],
   "source": [
    "from pyctcdecode import build_ctcdecoder\n",
    "\n",
    "# Build the CTC decoder from the id-ordered labels and the ARPA language model file.\n",
    "decoder = build_ctcdecoder(\n",
    "    labels=list(sorted_vocab_dict.keys()),\n",
    "    kenlm_model_path=\"language_model/5gram_correct.arpa\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "40dd5ada",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Wav2Vec2ProcessorWithLM\n",
    "\n",
    "# Combine the existing feature extractor and tokenizer with the new decoder.\n",
    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
    "    feature_extractor=processor.feature_extractor,\n",
    "    tokenizer=processor.tokenizer,\n",
    "    decoder=decoder,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "754cd832",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the target path from `username` (set in the configuration cell) so that\n",
    "# changing it there is enough; with username = \"Plim\" the path is unchanged.\n",
    "processor_with_lm.save_pretrained(f\"{username}/xls-r-1b-cv_8-fr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "577d9617",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}