vitouphy committed on
Commit
bb8c2ed
•
1 Parent(s): 8c7fe04

add language model

Browse files
alphabet.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"labels": [" ", "\u1780", "\u1781", "\u1782", "\u1783", "\u1784", "\u1785", "\u1786", "\u1787", "\u1788", "\u1789", "\u178a", "\u178b", "\u178c", "\u178d", "\u178e", "\u178f", "\u1790", "\u1791", "\u1792", "\u1793", "\u1794", "\u1795", "\u1796", "\u1797", "\u1798", "\u1799", "\u179a", "\u179b", "\u179c", "\u179f", "\u17a0", "\u17a1", "\u17a2", "\u17a5", "\u17a7", "\u17aa", "\u17ab", "\u17ac", "\u17ad", "\u17ae", "\u17af", "\u17b1", "\u17b6", "\u17b7", "\u17b8", "\u17b9", "\u17ba", "\u17bb", "\u17bc", "\u17bd", "\u17be", "\u17bf", "\u17c0", "\u17c1", "\u17c2", "\u17c3", "\u17c4", "\u17c5", "\u17c6", "\u17c7", "\u17c8", "\u17c9", "\u17ca", "\u17cb", "\u17cc", "\u17cd", "\u17ce", "\u17cf", "\u17d0", "\u17d2", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
build_lm_processor.ipynb ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "5393aa33",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor, AutoProcessor, Wav2Vec2ProcessorWithLM\n",
11
+ "from datasets import load_dataset, load_metric, Audio\n",
12
+ "from pyctcdecode import build_ctcdecoder\n",
13
+ "from pydub import AudioSegment\n",
14
+ "from pydub.playback import play\n",
15
+ "\n",
16
+ "import numpy as np\n",
17
+ "import torch\n",
18
+ "import kenlm\n",
19
+ "import pandas as pd\n",
20
+ "import random\n",
21
+ "import soundfile as sf\n",
22
+ "from tqdm.auto import tqdm"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 2,
28
+ "id": "2d34d3b8",
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_text_word_unigram.arpa'\n",
33
+ "KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "id": "f0354cb2",
40
+ "metadata": {},
41
+ "outputs": [
42
+ {
43
+ "name": "stderr",
44
+ "output_type": "stream",
45
+ "text": [
46
+ "Loading the LM will be faster if you build a binary file.\n",
47
+ "Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
48
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
49
+ "Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
50
+ "****************************************************************************************************\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 4,
61
+ "id": "109f28e9",
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "name": "stdout",
66
+ "output_type": "stream",
67
+ "text": [
68
+ "{'|': 0, 'ក': 1, 'ខ': 2, 'គ': 3, 'ឃ': 4, 'ង': 5, 'ច': 6, 'ឆ': 7, 'ជ': 8, 'ឈ': 9, 'ញ': 10, 'ដ': 11, 'ឋ': 12, 'ឌ': 13, 'ឍ': 14, 'ណ': 15, 'ត': 16, 'ថ': 17, 'ទ': 18, 'ធ': 19, 'ន': 20, 'ប': 21, 'ផ': 22, 'ព': 23, 'ភ': 24, 'ម': 25, 'យ': 26, 'រ': 27, 'ល': 28, 'វ': 29, 'ស': 30, 'ហ': 31, 'ឡ': 32, 'អ': 33, 'ឥ': 34, 'ឧ': 35, 'ឪ': 36, 'ឫ': 37, 'ឬ': 38, 'ឭ': 39, 'ឮ': 40, 'ឯ': 41, 'ឱ': 42, 'ា': 43, 'ិ': 44, 'ី': 45, 'ឹ': 46, 'ឺ': 47, 'ុ': 48, 'ូ': 49, 'ួ': 50, 'ើ': 51, 'ឿ': 52, 'ៀ': 53, 'េ': 54, 'ែ': 55, 'ៃ': 56, 'ោ': 57, 'ៅ': 58, 'ំ': 59, 'ះ': 60, 'ៈ': 61, '៉': 62, '៊': 63, '់': 64, '៌': 65, '៍': 66, '៎': 67, '៏': 68, '័': 69, '្': 70, '[unk]': 71, '[pad]': 72, '<s>': 73, '</s>': 74}\n"
69
+ ]
70
+ }
71
+ ],
72
+ "source": [
73
+ "vocab_dict = processor.tokenizer.get_vocab()\n",
74
+ "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}\n",
75
+ "print(sorted_vocab_dict)"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 5,
81
+ "id": "300cec39",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "name": "stderr",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "Loading the LM will be faster if you build a binary file.\n",
89
+ "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
90
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
91
+ "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
92
+ "****************************************************************************************************\n"
93
+ ]
94
+ }
95
+ ],
96
+ "source": [
97
+ "decoder = build_ctcdecoder(\n",
98
+ " labels=list(sorted_vocab_dict.keys()),\n",
99
+ " kenlm_model_path=KENLM_MODEL_LOC,\n",
100
+ ")"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 8,
106
+ "id": "27dd8427",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
111
+ " feature_extractor=processor.feature_extractor,\n",
112
+ " tokenizer=processor.tokenizer,\n",
113
+ " decoder=decoder\n",
114
+ ")"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 9,
120
+ "id": "94eb248e",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "processor_with_lm.save_pretrained(\".\")"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "markdown",
129
+ "id": "8f9b3dcc",
130
+ "metadata": {},
131
+ "source": [
132
+ "## Save Model"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 9,
138
+ "id": "8b584690",
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "application/vnd.jupyter.widget-view+json": {
144
+ "model_id": "bc5bf68946064e97b869d44b02e7af19",
145
+ "version_major": 2,
146
+ "version_minor": 0
147
+ },
148
+ "text/plain": [
149
+ "Downloading: 0%| | 0.00/1.18G [00:00<?, ?B/s]"
150
+ ]
151
+ },
152
+ "metadata": {},
153
+ "output_type": "display_data"
154
+ }
155
+ ],
156
+ "source": [
157
+ "model = AutoModelForCTC.from_pretrained(\"vitouphy/xls-r-300m-km\")"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 12,
163
+ "id": "3712c030",
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "model.save_pretrained('.')"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "b5d8de20",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": []
177
+ }
178
+ ],
179
+ "metadata": {
180
+ "kernelspec": {
181
+ "display_name": "Python 3 (ipykernel)",
182
+ "language": "python",
183
+ "name": "python3"
184
+ },
185
+ "language_info": {
186
+ "codemirror_mode": {
187
+ "name": "ipython",
188
+ "version": 3
189
+ },
190
+ "file_extension": ".py",
191
+ "mimetype": "text/x-python",
192
+ "name": "python",
193
+ "nbconvert_exporter": "python",
194
+ "pygments_lexer": "ipython3",
195
+ "version": "3.8.8"
196
+ }
197
+ },
198
+ "nbformat": 4,
199
+ "nbformat_minor": 5
200
+ }
language_model/attrs.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/km_wiki_ngram.arpa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4eae7d94d04e95668df7306edf35e21f4bbab2a73c736b921e531cd25cde6d0
3
+ size 109085039
language_model/unigrams.txt ADDED
The diff for this file is too large to render. See raw diff