vitouphy commited on
Commit
23c8c88
1 Parent(s): 3e8f2f7
build_lm_processor.ipynb CHANGED
@@ -3,7 +3,7 @@
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
- "id": "5393aa33",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -25,32 +25,21 @@
25
  {
26
  "cell_type": "code",
27
  "execution_count": 2,
28
- "id": "2d34d3b8",
29
  "metadata": {},
30
  "outputs": [],
31
  "source": [
32
  "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_text_word_unigram.arpa'\n",
33
- "KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'"
 
34
  ]
35
  },
36
  {
37
  "cell_type": "code",
38
  "execution_count": 3,
39
- "id": "f0354cb2",
40
  "metadata": {},
41
- "outputs": [
42
- {
43
- "name": "stderr",
44
- "output_type": "stream",
45
- "text": [
46
- "Loading the LM will be faster if you build a binary file.\n",
47
- "Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
48
- "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
49
- "Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
50
- "****************************************************************************************************\n"
51
- ]
52
- }
53
- ],
54
  "source": [
55
  "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
56
  ]
@@ -58,7 +47,7 @@
58
  {
59
  "cell_type": "code",
60
  "execution_count": 4,
61
- "id": "109f28e9",
62
  "metadata": {},
63
  "outputs": [
64
  {
@@ -78,18 +67,16 @@
78
  {
79
  "cell_type": "code",
80
  "execution_count": 5,
81
- "id": "300cec39",
82
  "metadata": {},
83
  "outputs": [
84
  {
85
  "name": "stderr",
86
  "output_type": "stream",
87
  "text": [
88
- "Loading the LM will be faster if you build a binary file.\n",
89
- "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
90
- "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
91
  "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
92
- "****************************************************************************************************\n"
93
  ]
94
  }
95
  ],
@@ -102,8 +89,8 @@
102
  },
103
  {
104
  "cell_type": "code",
105
- "execution_count": 8,
106
- "id": "27dd8427",
107
  "metadata": {},
108
  "outputs": [],
109
  "source": [
@@ -116,8 +103,8 @@
116
  },
117
  {
118
  "cell_type": "code",
119
- "execution_count": 9,
120
- "id": "94eb248e",
121
  "metadata": {},
122
  "outputs": [],
123
  "source": [
@@ -126,7 +113,7 @@
126
  },
127
  {
128
  "cell_type": "markdown",
129
- "id": "8f9b3dcc",
130
  "metadata": {},
131
  "source": [
132
  "## Save Model"
@@ -135,7 +122,7 @@
135
  {
136
  "cell_type": "code",
137
  "execution_count": 9,
138
- "id": "8b584690",
139
  "metadata": {},
140
  "outputs": [
141
  {
@@ -160,7 +147,7 @@
160
  {
161
  "cell_type": "code",
162
  "execution_count": 12,
163
- "id": "3712c030",
164
  "metadata": {},
165
  "outputs": [],
166
  "source": [
@@ -170,7 +157,7 @@
170
  {
171
  "cell_type": "code",
172
  "execution_count": null,
173
- "id": "b5d8de20",
174
  "metadata": {},
175
  "outputs": [],
176
  "source": []
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
+ "id": "57176d39",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
25
  {
26
  "cell_type": "code",
27
  "execution_count": 2,
28
+ "id": "dbc1f98a",
29
  "metadata": {},
30
  "outputs": [],
31
  "source": [
32
  "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_text_word_unigram.arpa'\n",
33
+ "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'\n",
34
+ "KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/kmwiki_5gram.binary'"
35
  ]
36
  },
37
  {
38
  "cell_type": "code",
39
  "execution_count": 3,
40
+ "id": "54d76e5f",
41
  "metadata": {},
42
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
43
  "source": [
44
  "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
45
  ]
 
47
  {
48
  "cell_type": "code",
49
  "execution_count": 4,
50
+ "id": "c76a5c8e",
51
  "metadata": {},
52
  "outputs": [
53
  {
 
67
  {
68
  "cell_type": "code",
69
  "execution_count": 5,
70
+ "id": "8b640127",
71
  "metadata": {},
72
  "outputs": [
73
  {
74
  "name": "stderr",
75
  "output_type": "stream",
76
  "text": [
77
+ "Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.\n",
 
 
78
  "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
79
+ "No known unigrams provided, decoding results might be a lot worse.\n"
80
  ]
81
  }
82
  ],
 
89
  },
90
  {
91
  "cell_type": "code",
92
+ "execution_count": 6,
93
+ "id": "2560c32d",
94
  "metadata": {},
95
  "outputs": [],
96
  "source": [
 
103
  },
104
  {
105
  "cell_type": "code",
106
+ "execution_count": 7,
107
+ "id": "badc19a1",
108
  "metadata": {},
109
  "outputs": [],
110
  "source": [
 
113
  },
114
  {
115
  "cell_type": "markdown",
116
+ "id": "89e517c8",
117
  "metadata": {},
118
  "source": [
119
  "## Save Model"
 
122
  {
123
  "cell_type": "code",
124
  "execution_count": 9,
125
+ "id": "ed9535c8",
126
  "metadata": {},
127
  "outputs": [
128
  {
 
147
  {
148
  "cell_type": "code",
149
  "execution_count": 12,
150
+ "id": "758b6f9a",
151
  "metadata": {},
152
  "outputs": [],
153
  "source": [
 
157
  {
158
  "cell_type": "code",
159
  "execution_count": null,
160
+ "id": "3166a19b",
161
  "metadata": {},
162
  "outputs": [],
163
  "source": []
language_model/unigrams.txt CHANGED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}