{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-overview",
   "metadata": {},
   "source": [
    "# Add an n-gram language-model decoder to `chmanoj/xls-r-300m-te`\n",
    "\n",
    "This notebook:\n",
    "\n",
    "1. loads the processor of the fine-tuned model,\n",
    "2. builds a `pyctcdecode` beam-search decoder from the tokenizer vocabulary and a KenLM 3-gram ARPA file,\n",
    "3. wraps both in a `Wav2Vec2ProcessorWithLM` and saves it into the current directory,\n",
    "4. converts the saved ARPA file to KenLM's binary format,\n",
    "5. pushes the directory to the Hugging Face Hub, and\n",
    "6. inspects the evaluation metadata stored in `README.md`.\n",
    "\n",
    "**Prerequisites**\n",
    "\n",
    "- Run the notebook from the local git clone of the model repository; `3gram_correct.arpa` must be in that directory.\n",
    "- The `kenlm` Python package must be installed in the kernel's environment, e.g. `pip install https://github.com/kpu/kenlm/archive/master.zip`.\n",
    "- A KenLM build providing `build_binary` is expected at `../kenlm/build/bin/`.\n",
    "\n",
    "Cell outputs and execution counts were cleared; use *Restart Kernel*, then *Run All* to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d23f1f27-fbf4-4fe5-a7b4-17815b23f283",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "\n",
    "from huggingface_hub import Repository\n",
    "from huggingface_hub.repocard import metadata_load\n",
    "from pyctcdecode import build_ctcdecoder\n",
    "from transformers import AutoProcessor, Wav2Vec2ProcessorWithLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-constants",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hub id of the fine-tuned model whose processor gets the LM decoder.\n",
    "MODEL_ID = \"chmanoj/xls-r-300m-te\"\n",
    "\n",
    "# Local clone of the model repository; every relative path below resolves against it.\n",
    "REPO_DIR = os.getcwd()\n",
    "\n",
    "# ARPA n-gram language model handed to pyctcdecode (expected directly in REPO_DIR).\n",
    "ARPA_LM_FILENAME = \"3gram_correct.arpa\"\n",
    "\n",
    "# Directory (inside REPO_DIR) where the saved decoder keeps its language-model files,\n",
    "# and the name of the binary LM produced from the ARPA copy stored there.\n",
    "LM_DIR = \"language_model\"\n",
    "BINARY_LM_FILENAME = \"3gram.bin\"\n",
    "\n",
    "# KenLM's ARPA -> binary converter, relative to REPO_DIR.\n",
    "KENLM_BUILD_BINARY = \"../kenlm/build/bin/build_binary\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load-processor",
   "metadata": {},
   "source": [
    "## Load the processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdefcb5e-0824-49ef-be73-8788cbb4e2a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor = AutoProcessor.from_pretrained(MODEL_ID)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-build-decoder",
   "metadata": {},
   "source": [
    "## Build the LM-boosted decoder\n",
    "\n",
    "The decoder labels are the tokenizer's tokens, lower-cased and ordered by token id. The assertion guards against two tokens collapsing into one label after lower-casing, which would shift every following label.\n",
    "\n",
    "When this notebook was originally run, building the decoder from the ARPA file logged the messages below. NOTE(review): confirm the two warnings are acceptable for this vocabulary.\n",
    "\n",
    "- `Loading the LM will be faster if you build a binary file.`\n",
    "- `Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?`\n",
    "- `Unigrams and labels don't seem to agree.`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef78538d-ca83-4cd3-824d-1b7928f5bc4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_dict = processor.tokenizer.get_vocab()\n",
    "\n",
    "# Sort by token id so that the resulting key order follows the token ids.\n",
    "sorted_vocab_dict = {\n",
    "    token.lower(): token_id\n",
    "    for token, token_id in sorted(vocab_dict.items(), key=lambda item: item[1])\n",
    "}\n",
    "\n",
    "# Lower-casing must not merge distinct tokens, otherwise the label list gets shorter\n",
    "# than the vocabulary and labels stop lining up with token ids.\n",
    "assert len(sorted_vocab_dict) == len(vocab_dict), (\n",
    "    \"lower-casing merged distinct tokens; labels no longer align with token ids\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21f4fb99-1c19-4a0a-9ac0-90dd38645585",
   "metadata": {},
   "outputs": [],
   "source": [
    "decoder = build_ctcdecoder(\n",
    "    labels=list(sorted_vocab_dict.keys()),\n",
    "    kenlm_model_path=os.path.join(REPO_DIR, ARPA_LM_FILENAME),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f892aada-710c-4bc2-a11f-c9a35c00870a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the original feature extractor and tokenizer; only the decoder is new.\n",
    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
    "    feature_extractor=processor.feature_extractor,\n",
    "    tokenizer=processor.tokenizer,\n",
    "    decoder=decoder,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-save-processor",
   "metadata": {},
   "source": [
    "## Save the processor into the repository clone"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f5775eb-aece-41fc-a1eb-8bf6f9b8f429",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_with_lm.save_pretrained(REPO_DIR)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-build-binary",
   "metadata": {},
   "source": [
    "## Convert the language model to KenLM binary format\n",
    "\n",
    "The conversion reads `language_model/3gram_correct.arpa` and writes `language_model/3gram.bin`.\n",
    "\n",
    "NOTE(review): nothing in this notebook removes the `.arpa` copy from `language_model/` afterwards. Confirm whether it should be deleted before pushing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5ea011b-9412-484a-b798-15fb6e338a99",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the converter from REPO_DIR so the relative paths resolve as intended.\n",
    "# stderr is merged into stdout so the tool's progress messages are shown in the cell.\n",
    "conversion = subprocess.run(\n",
    "    [\n",
    "        KENLM_BUILD_BINARY,\n",
    "        os.path.join(LM_DIR, ARPA_LM_FILENAME),\n",
    "        os.path.join(LM_DIR, BINARY_LM_FILENAME),\n",
    "    ],\n",
    "    cwd=REPO_DIR,\n",
    "    stdout=subprocess.PIPE,\n",
    "    stderr=subprocess.STDOUT,\n",
    "    universal_newlines=True,\n",
    ")\n",
    "print(conversion.stdout)\n",
    "\n",
    "# Fail the cell (after showing the tool's output) if the conversion did not succeed,\n",
    "# so a broken or missing binary is never pushed.\n",
    "conversion.check_returncode()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-push-to-hub",
   "metadata": {},
   "source": [
    "## Push to the Hub\n",
    "\n",
    "The original run uploaded `language_model/3gram.bin` (about 771 MB) in a commit to `chmanoj/xls-r-300m-te`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dcfe5d2-063f-4b34-9fdd-5f025ef9f699",
   "metadata": {},
   "outputs": [],
   "source": [
    "repo = Repository(local_dir=REPO_DIR)\n",
    "repo.push_to_hub(commit_message=\"Upload lm-boosted decoder\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c71ab8cb-8732-4d40-aa77-503421ac717c",
   "metadata": {},
   "source": [
    "## Evaluation\n",
    "\n",
    "The evaluation command is kept here for reference; it is not executed by this notebook:\n",
    "\n",
    "```bash\n",
    "python eval.py --model_id=\"chmanoj/xls-r-300m-te\" --dataset=\"openslr_SLR66\" --config=\"te\" --split=\"test\" --log_outputs\n",
    "```\n",
    "\n",
    "The cell below displays the metadata stored in the model card (`README.md`). At the time of the original run it listed, for Open SLR (`SLR66`), a Test WER of 24.695121951219512 and a Test CER of 4.861934182322532."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a56f846c-fa92-48d5-873e-3788748dd9e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_card_metadata = metadata_load(os.path.join(REPO_DIR, \"README.md\"))\n",
    "model_card_metadata"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}