{ "cells": [ { "cell_type": "code", "execution_count": 20, "id": "04c8de09", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "import re" ] }, { "cell_type": "code", "execution_count": 23, "id": "1eae750a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/fr/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)\n" ] } ], "source": [ "dataset = load_dataset(\"mozilla-foundation/common_voice_7_0\", \"fr\", split=\"train\", use_auth_token=True)" ] }, { "cell_type": "code", "execution_count": 24, "id": "da1cfcaa", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c110c54654c045b9a2cbc6cad43fa685", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0ex [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "chars_to_ignore_regex = '[^a-zàâäçéèêëîïôöùûüÿ\\'’ ]'\n", "\n", "def extract_text(batch):\n", " batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", batch[\"sentence\"].lower()).replace('’', \"'\")\n", " return batch\n", "\n", "dataset = dataset.map(extract_text, remove_columns=[\"sentence\"])" ] }, { "cell_type": "code", "execution_count": 25, "id": "bb306916", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d21bc14560b747f49105f598a2ffe2ff", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Pushing dataset shards to the dataset hub: 0%| | 0/29 [00:00\" in line:\n", " write_file.write(line)\n", " write_file.write(line.replace(\"\", \"\"))\n", " has_added_eos = True\n", " else:\n", " write_file.write(line)" ] }, { "cell_type": "code", "execution_count": 1, "id": "07ff4067", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoProcessor" ] }, { "cell_type": "code", "execution_count": 3, "id": "e75ab227", "metadata": {}, "outputs": [], "source": [ "processor = AutoProcessor.from_pretrained(\"./\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "604776b7", "metadata": {}, "outputs": [], "source": [ "vocab_dict = processor.tokenizer.get_vocab()" ] }, { "cell_type": "code", "execution_count": 5, "id": "ef4dd957", "metadata": {}, "outputs": [], "source": [ "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}" ] }, { "cell_type": "code", "execution_count": 6, "id": "9a14839d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading the LM will be faster if you build a binary file.\n", "Reading /home/pascal/kenlm/build/bin/xls-r-300m-lm-fr/language_model/5gram_correct.arpa\n", "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", "****************************************************************************************************\n" ] } ], "source": [ "from pyctcdecode import build_ctcdecoder\n", "\n", "decoder = build_ctcdecoder(\n", " labels=list(sorted_vocab_dict.keys()),\n", " kenlm_model_path=\"./language_model/5gram_correct.arpa\",\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "id": "656979ca", "metadata": {}, "outputs": [], "source": [ "from transformers import Wav2Vec2ProcessorWithLM\n", "\n", "processor_with_lm = Wav2Vec2ProcessorWithLM(\n", " feature_extractor=processor.feature_extractor,\n", " tokenizer=processor.tokenizer,\n", " decoder=decoder\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "id": "d2dd8891", "metadata": {}, "outputs": [], "source": [ "processor_with_lm.save_pretrained(\"xls-r-300m-lm-fr\")" ] }, { "cell_type": "code", "execution_count": null, "id": "85908c6d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 5 }