omarkamali committed
Commit bb8d10b · verified · 1 parent: d1c185c

Upload all models and assets for ca (latest)

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +347 -136
  3. models/embeddings/aligned/ca_128d.bin +3 -0
  4. models/embeddings/aligned/ca_128d.meta.json +1 -0
  5. models/embeddings/aligned/ca_128d.projection.npy +3 -0
  6. models/embeddings/aligned/ca_128d_metadata.json +8 -0
  7. models/embeddings/aligned/ca_32d.bin +3 -0
  8. models/embeddings/aligned/ca_32d.meta.json +1 -0
  9. models/embeddings/aligned/ca_32d.projection.npy +3 -0
  10. models/embeddings/aligned/ca_32d_metadata.json +8 -0
  11. models/embeddings/aligned/ca_64d.bin +3 -0
  12. models/embeddings/aligned/ca_64d.meta.json +1 -0
  13. models/embeddings/aligned/ca_64d.projection.npy +3 -0
  14. models/embeddings/aligned/ca_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/ca_128d.bin +2 -2
  16. models/embeddings/monolingual/ca_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/ca_32d.bin +2 -2
  18. models/embeddings/monolingual/ca_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/ca_64d.bin +2 -2
  20. models/embeddings/monolingual/ca_64d_metadata.json +5 -3
  21. models/subword_markov/ca_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/ca_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/ca_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/ca_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/ca_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/ca_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/ca_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/ca_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/ca_2gram_subword.parquet +2 -2
  30. models/subword_ngram/ca_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/ca_3gram_subword.parquet +2 -2
  32. models/subword_ngram/ca_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/ca_4gram_subword.parquet +2 -2
  34. models/subword_ngram/ca_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/ca_5gram_subword.parquet +3 -0
  36. models/subword_ngram/ca_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/ca_tokenizer_16k.model +2 -2
  38. models/tokenizer/ca_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/ca_tokenizer_32k.model +2 -2
  40. models/tokenizer/ca_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/ca_tokenizer_64k.model +2 -2
  42. models/tokenizer/ca_tokenizer_64k.vocab +0 -0
  43. models/tokenizer/ca_tokenizer_8k.model +2 -2
  44. models/tokenizer/ca_tokenizer_8k.vocab +0 -0
  45. models/vocabulary/ca_vocabulary.parquet +2 -2
  46. models/vocabulary/ca_vocabulary_metadata.json +10 -9
  47. models/vocabulary/ca_vocabulary_top.parquet +3 -0
  48. models/vocabulary/ca_vocabulary_top_metadata.json +20 -0
  49. models/word_markov/ca_markov_ctx1_word.parquet +2 -2
  50. models/word_markov/ca_markov_ctx1_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -10,11 +10,21 @@ tags:
  - n-gram
  - markov
  - wikipedia
+ - feature-extraction
+ - sentence-similarity
+ - tokenization
+ - n-grams
+ - markov-chain
+ - text-mining
+ - fasttext
+ - babelvec
+ - vocabulous
+ - vocabulary
  - monolingual
  - family-romance_galloitalic
  license: mit
  library_name: wikilangs
- pipeline_tag: feature-extraction
+ pipeline_tag: text-generation
  datasets:
  - omarkamali/wikipedia-monthly
  dataset_info:
@@ -23,14 +33,14 @@ dataset_info:
  metrics:
  - name: best_compression_ratio
    type: compression
-   value: 4.298
+   value: 4.448
  - name: best_isotropy
    type: isotropy
-   value: 0.7184
+   value: 0.7469
  - name: vocabulary_size
    type: vocab
-   value: 1000000
+   value: 0
- generated: 2025-12-28
+ generated: 2026-01-08
  ---
  
  # Catalan - Wikilangs Models
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
  ### Models & Assets
  
  - Tokenizers (8k, 16k, 32k, 64k)
- - N-gram models (2, 3, 4-gram)
- - Markov chains (context of 1, 2, 3 and 4)
+ - N-gram models (2, 3, 4, 5-gram)
+ - Markov chains (context of 1, 2, 3, 4 and 5)
  - Subword N-gram and Markov chains
- - Embeddings in various sizes and dimensions
+ - Embeddings in various sizes and dimensions (aligned and unaligned)
  - Language Vocabulary
  - Language Statistics
+ 
  ![Performance Dashboard](visualizations/performance_dashboard.png)
  
  ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
- - [6. Summary & Recommendations](#6-summary--recommendations)
+ - [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
+ - [7. Summary & Recommendations](#7-summary--recommendations)
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
  - [Visualizations Index](#visualizations-index)
  
@@ -68,53 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
  
+ ![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+ 
+ ![Tokenizer OOV](visualizations/tokenizer_oov.png)
+ 
+ ![Total Tokens](visualizations/tokenizer_total_tokens.png)
+ 
  ### Results
  
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
  |------------|-------------|---------------|----------|--------------|
- | **8k** | 3.555x | 3.53 | 0.1339% | 4,215,094 |
- | **16k** | 3.867x | 3.84 | 0.1457% | 3,874,434 |
- | **32k** | 4.116x | 4.09 | 0.1551% | 3,639,962 |
- | **64k** | 4.298x 🏆 | 4.27 | 0.1619% | 3,486,449 |
+ | **8k** | 3.608x | 3.61 | 0.1295% | 3,980,202 |
+ | **16k** | 3.955x | 3.96 | 0.1420% | 3,630,953 |
+ | **32k** | 4.237x | 4.24 | 0.1521% | 3,389,435 |
+ | **64k** | 4.448x 🏆 | 4.45 | 0.1597% | 3,228,954 |
  
  ### Tokenization Examples
  
  Below are sample sentences tokenized with each vocabulary size:
  
- **Sample 1:** `Guilherand-Granges és un municipi de la regió d'Alvèrnia-Roine-Alps i el departa...`
+ **Sample 1:** `Llista de topònims (noms propis de lloc) del municipi de Capmany, a l'Alt Empord...`
  
  | Vocab | Tokens | Count |
  |-------|--------|-------|
- | 8k | `▁gu il her and - gran ges ▁és ▁un ▁municipi ... (+35 more)` | 45 |
- | 16k | `▁gu il her and - gran ges ▁és ▁un ▁municipi ... (+33 more)` | 43 |
- | 32k | `▁guil her and - gran ges ▁és ▁un ▁municipi ▁de ... (+29 more)` | 39 |
- | 64k | `▁guil her and - gran ges ▁és ▁un ▁municipi ▁de ... (+27 more)` | 37 |
+ | 8k | `▁llista ▁de ▁topònims ▁( nom s ▁propis ▁de ▁lloc ) ... (+13 more)` | 23 |
+ | 16k | `▁llista ▁de ▁topònims ▁( nom s ▁propis ▁de ▁lloc ) ... (+13 more)` | 23 |
+ | 32k | `▁llista ▁de ▁topònims ▁( nom s ▁propis ▁de ▁lloc ) ... (+12 more)` | 22 |
+ | 64k | `▁llista ▁de ▁topònims ▁( noms ▁propis ▁de ▁lloc ) ▁del ... (+10 more)` | 20 |
  
- **Sample 2:** `Estheria (crustaci), un gènere de crustacis del període Carbonífer
- Estheria (dí...`
+ **Sample 2:** `Trànsportni (Krasnodar), poble del krai de Krasnodar, a Rússia Trànsportni (Maga...`
  
  | Vocab | Tokens | Count |
  |-------|--------|-------|
- | 8k | `▁est h eria ▁( cr usta ci ), ▁un ▁gènere ... (+32 more)` | 42 |
- | 16k | `▁est h eria ▁( cr usta ci ), ▁un ▁gènere ... (+29 more)` | 39 |
- | 32k | `▁est h eria ▁( cr usta ci ), ▁un ▁gènere ... (+27 more)` | 37 |
- | 64k | `▁est h eria ▁( cr usta ci ), ▁un ▁gènere ... (+23 more)` | 33 |
+ | 8k | `▁tr àn s port ni ▁( k ras n od ... (+39 more)` | 49 |
+ | 16k | `▁tràn sport ni ▁( k ras n od ar ), ... (+33 more)` | 43 |
+ | 32k | `▁tràn sport ni ▁( k ras n odar ), ▁poble ... (+27 more)` | 37 |
+ | 64k | `▁tràn sport ni ▁( k ras n odar ), ▁poble ... (+25 more)` | 35 |
  
- **Sample 3:** `Torneig de tennis masculí: St. Petersburg Open 2021
- Torneig de tennis femení: S...`
+ **Sample 3:** `Torneigs de tennis masculí: Serbia Open (ATP 250) Belgrade Open (ATP 250) Tornei...`
  
  | Vocab | Tokens | Count |
  |-------|--------|-------|
- | 8k | `▁torneig ▁de ▁ten nis ▁mascul í : ▁st . ▁peters ... (+28 more)` | 38 |
- | 16k | `▁torneig ▁de ▁tennis ▁masculí : ▁st . ▁petersburg ▁open ▁ ... (+21 more)` | 31 |
- | 32k | `▁torneig ▁de ▁tennis ▁masculí : ▁st . ▁petersburg ▁open ▁ ... (+21 more)` | 31 |
- | 64k | `▁torneig ▁de ▁tennis ▁masculí : ▁st . ▁petersburg ▁open ▁ ... (+19 more)` | 29 |
+ | 8k | `▁torneig s ▁de ▁ten nis ▁mascul í : ▁ser bia ... (+44 more)` | 54 |
+ | 16k | `▁torneig s ▁de ▁tennis ▁masculí : ▁ser bia ▁open ▁( ... (+38 more)` | 48 |
+ | 32k | `▁torneigs ▁de ▁tennis ▁masculí : ▁ser bia ▁open ▁( atp ... (+34 more)` | 44 |
+ | 64k | `▁torneigs ▁de ▁tennis ▁masculí : ▁ser bia ▁open ▁( atp ... (+33 more)` | 43 |
  
  ### Key Findings
  
- - **Best Compression:** 64k achieves 4.298x compression
- - **Lowest UNK Rate:** 8k with 0.1339% unknown tokens
+ - **Best Compression:** 64k achieves 4.448x compression
+ - **Lowest UNK Rate:** 8k with 0.1295% unknown tokens
  - **Trade-off:** Larger vocabularies improve compression but increase model size
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
  
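To sanity-check the tokenizer numbers above, the released `.model` files can be loaded directly. A minimal sketch, assuming they are standard SentencePiece models and a local checkout of the repo (the sample text is illustrative):

```python
# Minimal sketch: load one released tokenizer and reproduce the
# characters-per-token compression figure reported in the table.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/ca_tokenizer_32k.model")

text = "Llista de topònims del municipi de Capmany, a l'Alt Empordà."
pieces = sp.encode(text, out_type=str)  # subword pieces such as '▁llista'
ids = sp.encode(text)                   # the same tokens as integer ids

compression = len(text) / len(ids)      # chars per token, as in the table
print(f"{len(ids)} tokens, compression ≈ {compression:.2f}x")
```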
@@ -123,57 +139,111 @@ Below are sample sentences tokenized with each vocabulary size:
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
  
+ ![N-gram Unique](visualizations/ngram_unique.png)
+ 
  ![N-gram Coverage](visualizations/ngram_coverage.png)
  
  ### Results
  
- | N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
- |--------|------------|---------|----------------|------------------|-------------------|
- | **2-gram** | 115,248 🏆 | 16.81 | 4,858,651 | 14.1% | 27.6% |
- | **2-gram** | 310 🏆 | 8.28 | 50,468 | 65.3% | 98.2% |
- | **3-gram** | 1,102,520 | 20.07 | 16,377,428 | 4.5% | 12.5% |
- | **3-gram** | 2,693 | 11.40 | 370,834 | 27.1% | 69.0% |
- | **4-gram** | 4,349,805 | 22.05 | 36,310,673 | 2.3% | 8.2% |
- | **4-gram** | 16,302 | 13.99 | 2,376,653 | 13.3% | 38.2% |
+ | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+ |--------|---------|------------|---------|----------------|------------------|-------------------|
+ | **2-gram** | Word | 167,717 | 17.36 | 4,576,334 | 10.6% | 23.4% |
+ | **2-gram** | Subword | 262 🏆 | 8.03 | 41,609 | 69.0% | 98.9% |
+ | **3-gram** | Word | 1,409,334 | 20.43 | 13,479,698 | 2.7% | 10.3% |
+ | **3-gram** | Subword | 2,211 | 11.11 | 288,734 | 29.3% | 72.4% |
+ | **4-gram** | Word | 4,798,593 | 22.19 | 27,616,287 | 1.8% | 7.6% |
+ | **4-gram** | Subword | 13,232 | 13.69 | 1,676,138 | 14.2% | 40.2% |
+ | **5-gram** | Word | 4,523,219 | 22.11 | 21,934,897 | 2.3% | 8.8% |
+ | **5-gram** | Subword | 58,187 | 15.83 | 6,034,155 | 7.7% | 24.2% |
  
  ### Top 5 N-grams by Size
  
- **2-grams:**
+ **2-grams (Word):**
  
  | Rank | N-gram | Count |
  |------|--------|-------|
- | 1 | `l '` | 6,103,238 |
- | 2 | `d '` | 5,990,435 |
- | 3 | `de la` | 3,858,095 |
- | 4 | `categoria :` | 2,458,133 |
- | 5 | `a la` | 1,831,941 |
+ | 1 | `de la` | 3,892,352 |
+ | 2 | `a la` | 1,832,648 |
+ | 3 | `de l` | 1,806,800 |
+ | 4 | `a l` | 1,007,338 |
+ | 5 | `de les` | 998,964 |
+ 
+ **3-grams (Word):**
+ 
+ | Rank | N-gram | Count |
+ |------|--------|-------|
+ | 1 | `de la seva` | 186,164 |
+ | 2 | `per a la` | 131,594 |
+ | 3 | `referències enllaços externs` | 121,418 |
+ | 4 | `la pel lícula` | 114,682 |
+ | 5 | `d octubre de` | 112,980 |
+ 
+ **4-grams (Word):**
+ 
+ | Rank | N-gram | Count |
+ |------|--------|-------|
+ | 1 | `de kitt peak spacewatch` | 78,569 |
+ | 2 | `de la universitat de` | 56,957 |
+ | 3 | `que hi havia el` | 55,303 |
+ | 4 | `segons el cens del` | 47,569 |
+ | 5 | `de la família dels` | 44,734 |
+ 
+ **5-grams (Word):**
+ 
+ | Rank | N-gram | Count |
+ |------|--------|-------|
+ | 1 | `el nombre mitjà de persones` | 43,284 |
+ | 2 | `el següent diagrama mostra les` | 42,548 |
+ | 3 | `següent diagrama mostra les poblacions` | 42,548 |
+ | 4 | `diagrama mostra les poblacions més` | 42,542 |
+ | 5 | `mostra les poblacions més properes` | 42,497 |
+ 
+ **2-grams (Subword):**
+ 
+ | Rank | N-gram | Count |
+ |------|--------|-------|
+ | 1 | `a _` | 65,660,325 |
+ | 2 | `s _` | 52,744,093 |
+ | 3 | `_ d` | 49,682,099 |
+ | 4 | `e _` | 42,364,044 |
+ | 5 | `d e` | 41,208,775 |
  
- **3-grams:**
+ **3-grams (Subword):**
  
  | Rank | N-gram | Count |
  |------|--------|-------|
- | 1 | `de l '` | 1,783,419 |
- | 2 | `a l '` | 1,006,018 |
- | 3 | `| | |` | 637,045 |
- | 4 | `. l '` | 491,768 |
- | 5 | `d ' una` | 438,363 |
+ | 1 | `_ d e` | 35,468,647 |
+ | 2 | `d e _` | 24,280,649 |
+ | 3 | `e s _` | 19,244,620 |
+ | 4 | `e l _` | 15,094,409 |
+ | 5 | `l a _` | 14,700,214 |
  
- **4-grams:**
+ **4-grams (Subword):**
  
  | Rank | N-gram | Count |
  |------|--------|-------|
- | 1 | `| | | |` | 320,057 |
- | 2 | `. referències categoria :` | 191,608 |
- | 3 | `categoria : naixements del` | 165,169 |
- | 4 | `- | | |` | 150,434 |
- | 5 | `d ' octubre de` | 137,349 |
+ | 1 | `_ d e _` | 23,793,570 |
+ | 2 | `_ l a _` | 12,534,324 |
+ | 3 | `_ e l _` | 8,556,406 |
+ | 4 | `s _ d e` | 7,523,945 |
+ | 5 | `d e _ l` | 7,343,393 |
+ 
+ **5-grams (Subword):**
+ 
+ | Rank | N-gram | Count |
+ |------|--------|-------|
+ | 1 | `_ d e _ l` | 7,323,223 |
+ | 2 | `_ d e l _` | 5,191,709 |
+ | 3 | `s _ d e _` | 5,107,850 |
+ | 4 | `_ q u e _` | 4,821,740 |
+ | 5 | `a _ d e _` | 4,540,758 |
  
  ### Key Findings
  
- - **Best Perplexity:** 2-gram with 310
+ - **Best Perplexity:** 2-gram (subword) with 262
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
- - **Coverage:** Top-1000 patterns cover ~38% of corpus
+ - **Coverage:** Top-1000 patterns cover ~24% of corpus
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
  
  ---
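The coverage columns above can be recomputed from the released count tables. A sketch for the top-N coverage numbers, assuming the parquet files expose `ngram` and `count` columns (inspect `df.columns` for the actual schema):

```python
# Sketch: recompute top-100 / top-1000 coverage for the subword 2-gram model.
import pandas as pd

df = pd.read_parquet("models/subword_ngram/ca_2gram_subword.parquet")
df = df.sort_values("count", ascending=False)  # most frequent n-grams first

total = df["count"].sum()
for n in (100, 1000):
    coverage = df["count"].head(n).sum() / total
    print(f"top-{n} coverage: {coverage:.1%}")
```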
@@ -181,55 +251,86 @@ Below are sample sentences tokenized with each vocabulary size:
  ![Markov Entropy](visualizations/markov_entropy.png)
  
+ ![Markov Contexts](visualizations/markov_contexts.png)
+ 
  ![Markov Branching](visualizations/markov_branching.png)
  
  ### Results
  
- | Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
- |---------|-------------|------------|------------------|-----------------|----------------|
- | **1** | 0.6354 | 1.553 | 8.32 | 4,839,481 | 36.5% |
- | **1** | 1.0355 | 2.050 | 9.71 | 25,162 | 0.0% |
- | **2** | 0.4773 | 1.392 | 3.30 | 40,230,784 | 52.3% |
- | **2** | 0.6327 | 1.550 | 4.11 | 244,427 | 36.7% |
- | **3** | 0.2736 | 1.209 | 1.81 | 132,591,901 | 72.6% |
- | **3** | 0.7103 | 1.636 | 4.34 | 1,004,684 | 29.0% |
- | **4** | 0.1501 🏆 | 1.110 | 1.34 | 239,447,638 | 85.0% |
- | **4** | 0.7156 🏆 | 1.642 | 3.65 | 4,363,359 | 28.4% |
+ | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+ |---------|---------|-------------|------------|------------------|-----------------|----------------|
+ | **1** | Word | 0.9702 | 1.959 | 13.70 | 3,298,751 | 3.0% |
+ | **1** | Subword | 0.8467 | 1.798 | 7.10 | 30,691 | 15.3% |
+ | **2** | Word | 0.4478 | 1.364 | 2.95 | 45,099,512 | 55.2% |
+ | **2** | Subword | 0.5676 | 1.482 | 3.72 | 217,960 | 43.2% |
+ | **3** | Word | 0.2425 | 1.183 | 1.66 | 133,056,441 | 75.8% |
+ | **3** | Subword | 0.6293 | 1.547 | 3.86 | 810,473 | 37.1% |
+ | **4** | Word | 0.1249 🏆 | 1.090 | 1.26 | 221,190,469 | 87.5% |
+ | **4** | Subword | 0.6563 | 1.576 | 3.56 | 3,128,822 | 34.4% |
+ 
+ ### Generated Text Samples (Word-based)
+ 
+ Below are text samples generated from each word-based Markov chain model:
+ 
+ **Context Size 1:**
+ 
+ 1. `de maig de la temporada l acceptació de muntar una muralla i el molí de la`
+ 2. `la població comunicació de encara que alemanya i des de la computació sent l estat substituïda`
+ 3. `i no són esmentats anteriorment icv el símbol del psoe des de guilgameix que un comerç`
+ 
+ **Context Size 2:**
+ 
+ 1. `de la guerra di mario tronti i no solament va trobar que era del 5è al 16è`
+ 2. `a la taula de composició amb la seva història general del magistrat monetari c cassi a la`
+ 3. `de l expedició del virrei un germà gran del poble ulldeconencs o ulldeconins són coneguts com a`
+ 
+ **Context Size 3:**
+ 
+ 1. `de la seva carrera periodística escrivint col laboracions a joves intel lectuals pertanyents a l alt...`
+ 2. `per a la secció de filosofia i ciències socials en les seves obligacions amb la seguretat i el`
+ 3. `referències enllaços externs fira festa de la pasqua hayivky el casament vessilia o ladkannya de la ...`
+ 
+ **Context Size 4:**
+ 
+ 1. `de kitt peak spacewatch 8 de novembre de parcak i mumford del 8 de novembre de militants del flec`
+ 2. `de la universitat de salamanca honoris causa per la universitat christian albrecht de kiel de la uni...`
+ 3. `que hi havia el 1 era una gran superfície de material de bricolatge 1 una botiga de congelats 1`
  
- ### Generated Text Samples
+ ### Generated Text Samples (Subword-based)
  
- Below are text samples generated from each Markov chain model:
+ Below are text samples generated from each subword-based Markov chain model:
  
  **Context Size 1:**
  
- 1. `de londres categoria : districte és un programador en carta de suècia ruben / 1982 )`
- 2. `, la tasca de nyerros que feia constar qui li ocasionaren la figura femenina de missouri`
- 3. `. en un planell inferior del castell d ' aquestes àrees a l ' un grup`
+ 1. `_daral_euílere_s`
+ 2. `eivinde_ditel'hi`
+ 3. `agraweros._ome_2`
  
  **Context Size 2:**
  
- 1. `l ' antiguitat i entre el 626 els àvars de la majoria dels parlants d ' aliments`
- 2. `d ' àmbits els quals louis companyo formà el 1918 lhasa va causar un accident cerebrovascular ,`
- 3. `de la influència dels descendents de dalmau i ribalta ( 1900 ) hindoo jugglers ( 1914 )`
+ 1. `a_ses_va_únivenci`
+ 2. `s_als_(rdor_reu_d`
+ 3. `_d'ofegria_amb_o_`
  
  **Context Size 3:**
  
- 1. `de l ' àlbum d ' estudi , en el seu camí per trobar intuïtivament i de sobte`
- 2. `a l ' hipotàlem i la suprarenal , una glàndula intramandibular inflada que s ' ha presentat a`
- 3. `| | | | — | - id = 312 bgcolor = # d6d6d6 | 459311 | |`
+ 1. `_de_bre_seteodent_`
+ 2. `de_la_de_col·locia`
+ 3. `es_pres,_nastorals`
  
  **Context Size 4:**
  
- 1. `| | | | 6 d ' abril , 2002 | | palomar | | neat | - |`
- 2. `. referències categoria : òperes de gaetano donizetti categoria : òperes del 1922 categoria : morts ...`
- 3. `categoria : naixements del 1914 categoria : morts el 2023 categoria : morts a bagdad categoria : mor...`
+ 1. `_de_doble_(a_−_batx`
+ 2. `_la_de_fan_es_va_ca`
+ 3. `_el_donar_les_si_es`
  
  ### Key Findings
  
- - **Best Predictability:** Context-4 with 85.0% predictability
+ - **Best Predictability:** Context-4 (word) with 87.5% predictability
  - **Branching Factor:** Decreases with context size (more deterministic)
- - **Memory Trade-off:** Larger contexts require more storage (4,363,359 contexts)
+ - **Memory Trade-off:** Larger contexts require more storage (3,128,822 contexts)
  - **Recommendation:** Context-3 or Context-4 for text generation
  
  ---
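The generation samples above come from sampling the transition tables. An illustrative sketch for the context-1 word model; the `context`, `next`, and `count` column names are assumptions about the parquet schema:

```python
# Sketch: sample a short sequence from the context-1 word Markov table.
import random
import pandas as pd

df = pd.read_parquet("models/word_markov/ca_markov_ctx1_word.parquet")

def sample_next(context: str) -> str:
    rows = df[df["context"] == context]
    if rows.empty:  # unseen context: fall back to a random transition
        rows = df.sample(1)
    # Draw the next word proportionally to its transition count.
    return random.choices(rows["next"].tolist(), weights=rows["count"].tolist())[0]

words = ["de"]
for _ in range(15):
    words.append(sample_next(words[-1]))
print(" ".join(words))
```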
@@ -245,64 +346,64 @@ Below are text samples generated from each Markov chain model:
  
  | Metric | Value |
  |--------|-------|
- | Vocabulary Size | 1,000,000 |
- | Total Tokens | 394,566,173 |
- | Mean Frequency | 394.57 |
- | Median Frequency | 9 |
- | Frequency Std Dev | 36714.18 |
+ | Vocabulary Size | 1,490,582 |
+ | Total Tokens | 372,231,757 |
+ | Mean Frequency | 249.72 |
+ | Median Frequency | 4 |
+ | Frequency Std Dev | 29623.92 |
  
  ### Most Common Words
  
  | Rank | Word | Frequency |
  |------|------|-----------|
- | 1 | de | 24,313,844 |
- | 2 | la | 12,938,517 |
- | 3 | i | 9,983,050 |
- | 4 | a | 9,644,831 |
- | 5 | el | 8,858,221 |
- | 6 | l | 6,235,709 |
- | 7 | d | 6,156,292 |
- | 8 | en | 5,560,129 |
- | 9 | del | 5,289,798 |
- | 10 | que | 4,942,373 |
+ | 1 | de | 23,862,515 |
+ | 2 | la | 12,874,088 |
+ | 3 | i | 9,923,035 |
+ | 4 | a | 9,593,194 |
+ | 5 | el | 8,820,173 |
+ | 6 | l | 6,195,164 |
+ | 7 | d | 5,995,004 |
+ | 8 | en | 5,534,785 |
+ | 9 | del | 5,257,995 |
+ | 10 | que | 4,926,945 |
  
  ### Least Common Words (from vocabulary)
  
  | Rank | Word | Frequency |
  |------|------|-----------|
- | 1 | cesel | 3 |
- | 2 | epinic | 3 |
- | 3 | deplexión | 3 |
- | 4 | ε³ | 3 |
- | 5 | α³ | 3 |
- | 6 | engelska | 3 |
- | 7 | rechercheconsultation | 3 |
- | 8 | pdfir | 3 |
- | 9 | βασίλιος | 3 |
- | 10 | βασιλείος | 3 |
+ | 1 | binaritruncat | 2 |
+ | 2 | fanerozoiques | 2 |
+ | 3 | biòmers | 2 |
+ | 4 | nianzhi | 2 |
+ | 5 | fuching | 2 |
+ | 6 | mndm | 2 |
+ | 7 | cpsf | 2 |
+ | 8 | preestàndard | 2 |
+ | 9 | sweetshop | 2 |
+ | 10 | whakaata | 2 |
  
  ### Zipf's Law Analysis
  
  | Metric | Value |
  |--------|-------|
- | Zipf Coefficient | 1.0318 |
- | R² (Goodness of Fit) | 0.994658 |
+ | Zipf Coefficient | 1.0222 |
+ | R² (Goodness of Fit) | 0.996032 |
  | Adherence Quality | **excellent** |
  
  ### Coverage Analysis
  
  | Top N Words | Coverage |
  |-------------|----------|
- | Top 100 | 43.6% |
- | Top 1,000 | 63.1% |
+ | Top 100 | 45.0% |
+ | Top 1,000 | 63.8% |
  | Top 5,000 | 78.5% |
- | Top 10,000 | 84.3% |
+ | Top 10,000 | 84.2% |
  
  ### Key Findings
  
- - **Zipf Compliance:** R²=0.9947 indicates excellent adherence to Zipf's law
- - **High Frequency Dominance:** Top 100 words cover 43.6% of corpus
- - **Long Tail:** 990,000 words needed for remaining 15.7% coverage
+ - **Zipf Compliance:** R²=0.9960 indicates excellent adherence to Zipf's law
+ - **High Frequency Dominance:** Top 100 words cover 45.0% of corpus
+ - **Long Tail:** 1,480,582 words needed for remaining 15.8% coverage
  
  ---
  ## 5. Word Embeddings Evaluation
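The Zipf numbers above follow from a least-squares fit of log frequency against log rank; the coefficient is the negated slope. A sketch, assuming the vocabulary parquet has a `count` column:

```python
# Sketch: refit the Zipf coefficient and R² from the vocabulary table.
import numpy as np
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/ca_vocabulary.parquet")
freqs = np.sort(vocab["count"].to_numpy())[::-1].astype(float)  # rank order
ranks = np.arange(1, len(freqs) + 1)

x, y = np.log(ranks), np.log(freqs)
slope, _ = np.polyfit(x, y, 1)          # Zipf's law: log f ≈ -s · log r + c
r2 = np.corrcoef(x, y)[0, 1] ** 2
print(f"Zipf coefficient ≈ {-slope:.4f}, R² ≈ {r2:.4f}")
```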
@@ -315,24 +416,131 @@ Below are text samples generated from each Markov chain model:
  
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
  
- ### Model Comparison
+ ### 5.1 Cross-Lingual Alignment
+ 
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
+ 
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
+ 
+ ### 5.2 Model Comparison
  
- | Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
- |-------|------------|-----------|----------|----------|----------|
- | **mono_32d** | 1,514,898 | 32 | 3.264 | 1.334 | 0.7184 🏆 |
- | **mono_64d** | 1,514,898 | 64 | 3.636 | 1.309 | 0.7113 |
- | **mono_128d** | 1,514,898 | 128 | 4.070 | 1.306 | 0.6648 |
- | **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
+ | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+ |-------|-----------|----------|------------------|---------------|----------------|
+ | **mono_32d** | 32 | 0.7469 🏆 | 0.3896 | N/A | N/A |
+ | **mono_64d** | 64 | 0.7390 | 0.2972 | N/A | N/A |
+ | **mono_128d** | 128 | 0.6902 | 0.2374 | N/A | N/A |
+ | **aligned_32d** | 32 | 0.7469 | 0.3696 | 0.4960 | 0.8360 |
+ | **aligned_64d** | 64 | 0.7390 | 0.3068 | 0.7200 | 0.9380 |
+ | **aligned_128d** | 128 | 0.6902 | 0.2443 | 0.8320 | 0.9720 |
  
  ### Key Findings
  
- - **Best Isotropy:** mono_32d with 0.7184 (more uniform distribution)
- - **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
- - **Vocabulary Coverage:** All models cover 1,514,898 words
- - **Recommendation:** 100d for balanced semantic capture and efficiency
+ - **Best Isotropy:** mono_32d with 0.7469 (more uniform distribution)
+ - **Semantic Density:** Average pairwise similarity of 0.3075. Lower values indicate better semantic separation.
+ - **Alignment Quality:** Aligned models achieve up to 83.2% R@1 in cross-lingual retrieval.
+ - **Recommendation:** 128d aligned for best cross-lingual performance
  
  ---
- ## 6. Summary & Recommendations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
  ![Performance Dashboard](visualizations/performance_dashboard.png)
338
 
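The substitutability test of section 6.2 is easy to illustrate: strip a candidate suffix and keep it only if the remaining stem is itself attested. A simplified sketch of that idea (the pipeline's real scoring is richer, and the `word` column name is an assumption):

```python
# Sketch: count vocabulary evidence for a candidate suffix such as '-es'.
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/ca_vocabulary_top.parquet")
words = set(vocab["word"])

def suffix_evidence(suffix: str, min_stem_len: int = 4) -> int:
    # A word supports the suffix if stripping it leaves an attested stem.
    return sum(
        1
        for w in words
        if w.endswith(suffix)
        and len(w) - len(suffix) >= min_stem_len
        and w[: -len(suffix)] in words
    )

print(suffix_evidence("es"), "words support the suffix '-es'")
```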
@@ -340,11 +548,12 @@ Below are text samples generated from each Markov chain model:
  
  | Component | Recommended | Rationale |
  |-----------|-------------|-----------|
- | Tokenizer | **32k BPE** | Best compression (4.30x) with low UNK rate |
- | N-gram | **5-gram** | Lowest perplexity (310) |
- | Markov | **Context-4** | Highest predictability (85.0%) |
+ | Tokenizer | **64k BPE** | Best compression (4.45x) |
+ | N-gram | **2-gram** | Lowest perplexity (262) |
+ | Markov | **Context-4** | Highest predictability (87.5%) |
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
+ 
  ---
  ## Appendix: Metrics Glossary & Interpretation Guide
  
@@ -534,7 +743,8 @@ If you use these models in your research, please cite:
  author = {Kamali, Omar},
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
  year = {2025},
- publisher = {HuggingFace},
+ doi = {10.5281/zenodo.18073153},
+ publisher = {Zenodo},
  url = {https://huggingface.co/wikilangs}
  institution = {Omneity Labs}
  }
@@ -550,7 +760,8 @@ MIT License - Free for academic and commercial use.
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+ - 🤝 Sponsor: [Featherless AI](https://featherless.ai)
  ---
  *Generated by Wikilangs Models Pipeline*
  
- *Report Date: 2025-12-28 16:22:11*
+ *Report Date: 2026-01-08 03:10:53*
 
models/embeddings/aligned/ca_128d.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24a1e4df40a3066dc73fa3b9c563fc1f4e8fb90e1aee1858380c3d7bd8a8e8a3
+ size 2501686884
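Note that the `.bin` entry above is a Git LFS pointer; the ~2.5 GB payload lives in LFS storage. Individual assets can be fetched without cloning the whole repo, roughly as follows (the repo id is an assumption; substitute the actual one):

```python
# Sketch: download a single asset from the Hub instead of cloning the repo.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="wikilangs/ca",  # assumption: use the repository this commit belongs to
    filename="models/embeddings/aligned/ca_32d.meta.json",
)
print(path)
```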
models/embeddings/aligned/ca_128d.meta.json ADDED
@@ -0,0 +1 @@
+ {"lang": "ca", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ca_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:802d4468d1f385670eaeb49471e886b448ff5e09a77dcb77154c3337c462986c
+ size 65664

models/embeddings/aligned/ca_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "language": "ca",
+   "dimension": 128,
+   "version": "aligned",
+   "hub_language": "en",
+   "seed_vocab_size": 338419,
+   "vocab_size": 1417503
+ }

models/embeddings/aligned/ca_32d.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7090d8847bec43a8a16f3caa2e657c0f40a895cd63a132dcd601abb6499dd545
+ size 645044580

models/embeddings/aligned/ca_32d.meta.json ADDED
@@ -0,0 +1 @@
+ {"lang": "ca", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ca_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4959f123eccf2b23e091c2a6c906c37ee6fffc5f512fc10722aabe48d584a75
+ size 4224

models/embeddings/aligned/ca_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "language": "ca",
+   "dimension": 32,
+   "version": "aligned",
+   "hub_language": "en",
+   "seed_vocab_size": 338419,
+   "vocab_size": 1417503
+ }

models/embeddings/aligned/ca_64d.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1880e73b0ed8f1e8ab616846d81db2396794ec49a6ebb244f50960fe73ff693c
+ size 1263925348

models/embeddings/aligned/ca_64d.meta.json ADDED
@@ -0,0 +1 @@
+ {"lang": "ca", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ca_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66569971bcebdb0f2bcfc3f5a476d7a05b75dc3989f9453416de8a30ce89ca1c
+ size 16512

models/embeddings/aligned/ca_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "language": "ca",
+   "dimension": 64,
+   "version": "aligned",
+   "hub_language": "en",
+   "seed_vocab_size": 338419,
+   "vocab_size": 1417503
+ }
models/embeddings/monolingual/ca_128d.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2de1ec6ef15f6007dfa5c9a4c67f73236a818307f9039071d3a4e96ea5f6e7b6
- size 2603504009
+ oid sha256:24a1e4df40a3066dc73fa3b9c563fc1f4e8fb90e1aee1858380c3d7bd8a8e8a3
+ size 2501686884

models/embeddings/monolingual/ca_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
    "dimension": 128,
    "version": "monolingual",
    "training_params": {
-     "dim": 128,
+     "algorithm": "skipgram",
      "min_count": 5,
      "window": 5,
      "negative": 5,
-     "epochs": 5
+     "epochs": 5,
+     "encoding_method": "rope",
+     "dim": 128
    },
-   "vocab_size": 1514898
+   "vocab_size": 1417503
  }

models/embeddings/monolingual/ca_32d.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9644da17bbff6c339ba936b958d9b31a96266a293851c8eccad67b458ce4841a
- size 672062345
+ oid sha256:7090d8847bec43a8a16f3caa2e657c0f40a895cd63a132dcd601abb6499dd545
+ size 645044580

models/embeddings/monolingual/ca_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
    "dimension": 32,
    "version": "monolingual",
    "training_params": {
-     "dim": 32,
+     "algorithm": "skipgram",
      "min_count": 5,
      "window": 5,
      "negative": 5,
-     "epochs": 5
+     "epochs": 5,
+     "encoding_method": "rope",
+     "dim": 32
    },
-   "vocab_size": 1514898
+   "vocab_size": 1417503
  }

models/embeddings/monolingual/ca_64d.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e8f8fb2c25755c94a88eace5de5f1fbd52ec30ff04fbffa01e595b78c516ed32
- size 1315876233
+ oid sha256:1880e73b0ed8f1e8ab616846d81db2396794ec49a6ebb244f50960fe73ff693c
+ size 1263925348

models/embeddings/monolingual/ca_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
    "dimension": 64,
    "version": "monolingual",
    "training_params": {
-     "dim": 64,
+     "algorithm": "skipgram",
      "min_count": 5,
      "window": 5,
      "negative": 5,
-     "epochs": 5
+     "epochs": 5,
+     "encoding_method": "rope",
+     "dim": 64
    },
-   "vocab_size": 1514898
+   "vocab_size": 1417503
  }
models/subword_markov/ca_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:270c80664a864c1f71060ea7f8d709b2d841f34520fcae7f80ab36d41715d729
- size 1444269
+ oid sha256:9eff07200d9975dea253fe43de601f122185e6a96985c26c438b63afaaf9acb1
+ size 1393281

models/subword_markov/ca_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
    "context_size": 1,
    "variant": "subword",
    "language": "ca",
-   "unique_contexts": 25162,
-   "total_transitions": 2374472116
+   "unique_contexts": 30691,
+   "total_transitions": 2170761675
  }

models/subword_markov/ca_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6553292a052ea21a7e04dd2fe1c65f9a324ceb7a03ff2b07b83bf8062abe2647
- size 8229347
+ oid sha256:ca5c5b1e54ae39698e5cacc14cec34cbf702e677bf2e32ec6603b6841603a6fa
+ size 6606050

models/subword_markov/ca_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
    "context_size": 2,
    "variant": "subword",
    "language": "ca",
-   "unique_contexts": 244427,
-   "total_transitions": 2373684587
+   "unique_contexts": 217960,
+   "total_transitions": 2169979038
  }

models/subword_markov/ca_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3fbb799a80b64b1325067d7884080bfc376e7a29f06c3e2867f5a831e0224606
- size 33793244
+ oid sha256:db35fc8ef584d59df2ee3752ca547611bb3f3a2bd606eb85c0dc6fc242d5e672
+ size 25408651

models/subword_markov/ca_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
    "context_size": 3,
    "variant": "subword",
    "language": "ca",
-   "unique_contexts": 1004684,
-   "total_transitions": 2372897058
+   "unique_contexts": 810473,
+   "total_transitions": 2169196401
  }

models/subword_markov/ca_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f490ba905b17564a560111ab0f1c465f11666745e7a4119fce8f3ba6f168102d
- size 126097716
+ oid sha256:2396047321998b3d2609f25bf3a10d651f28323cbf9cb2bde8269322c726d3ef
+ size 90579198

models/subword_markov/ca_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
    "context_size": 4,
    "variant": "subword",
    "language": "ca",
-   "unique_contexts": 4363359,
-   "total_transitions": 2372109529
+   "unique_contexts": 3128822,
+   "total_transitions": 2168413764
  }
models/subword_ngram/ca_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b7ab6db3e589628283d59649de17cd75a05bc6be07a068cbbe0464e833f5e1ad
- size 693894
+ oid sha256:8436fa1ec5955cb730f5246fc40eabfe83d8b9edf3339cb07157ca799560b084
+ size 590509

models/subword_ngram/ca_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
    "n": 2,
    "variant": "subword",
    "language": "ca",
-   "unique_ngrams": 50468,
-   "total_ngrams": 2374472116
+   "unique_ngrams": 41609,
+   "total_ngrams": 2170761675
  }

models/subword_ngram/ca_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4ccfc3bdd00e817e25923cd9aa462aec12b25e85ae59752607b8ca8d99cb3b94
- size 4601148
+ oid sha256:96f568707811cc9639e19837002c41df6be438a465ba2d78ffab6ff9e0b94288
+ size 3621792

models/subword_ngram/ca_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
    "n": 3,
    "variant": "subword",
    "language": "ca",
-   "unique_ngrams": 370834,
-   "total_ngrams": 2373684587
+   "unique_ngrams": 288734,
+   "total_ngrams": 2169979038
  }

models/subword_ngram/ca_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a08ad0ffa43059f8afb0bd3da57b5c77ce0655de29a285e0c08a1e1d5f029d0f
- size 28473302
+ oid sha256:f51c3b0800498bf443c8ecbf8324c285828c177a16f9e740b0dd747334b77dc4
+ size 20638377

models/subword_ngram/ca_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
    "n": 4,
    "variant": "subword",
    "language": "ca",
-   "unique_ngrams": 2376653,
-   "total_ngrams": 2372897058
+   "unique_ngrams": 1676138,
+   "total_ngrams": 2169196401
  }

models/subword_ngram/ca_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ace21b62f81448899fa83f82e04ede8337828b8fc9fa347420bff131c37d5b8e
+ size 74421040

models/subword_ngram/ca_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "n": 5,
+   "variant": "subword",
+   "language": "ca",
+   "unique_ngrams": 6034155,
+   "total_ngrams": 2168413764
+ }
models/tokenizer/ca_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f01f7af1f8d5c4e98ce3e2b3f156da25a7fc75208e444d2da62aa6bd055eb53c
- size 513099
+ oid sha256:09cedc3c99d8c1bf292ffebd251dd5c3ad9de85c2f453e3e2832994834c283b8
+ size 511401

models/tokenizer/ca_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff.

models/tokenizer/ca_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c1ec62b707367a1c81e08267aa8b8e0399165c52209f9fc1b829d24bb692a9a9
- size 795646
+ oid sha256:8b22bc0012e3d4b698e9d9f2caf992c456c9a549b663b1a55d05642a0f73648b
+ size 790628

models/tokenizer/ca_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff.

models/tokenizer/ca_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:377e3e1a65683ec7a5ea17048490d3664d56f4aa83bf05202b915f3b4294c9b5
- size 1368059
+ oid sha256:88d889048650ea6a5950b07988fd2f76716015504f0d365d9a6ca927d6f5e1da
+ size 1358262

models/tokenizer/ca_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff.

models/tokenizer/ca_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c14f1b4742fb21d6ca5d98cb15380605f889146de927013a491de70b5e12e57b
- size 375029
+ oid sha256:b638ce04790bb844a2400a620c30328aac4bd967de91839673382a6537634870
+ size 374334

models/tokenizer/ca_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff.
models/vocabulary/ca_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:93e0c0ee1b4d1a2b23217b790aa4baa87458d5f7ff2b006079314bff72875c6a
- size 14885768
+ oid sha256:b996ac5d5b1626b076abdc107b305b912677f3186fb9bcc303dcc216d8153a3e
+ size 22533215

models/vocabulary/ca_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
  {
    "language": "ca",
-   "vocabulary_size": 1000000,
+   "vocabulary_size": 1490582,
+   "variant": "full",
    "statistics": {
-     "type_token_ratio": 0.012118454476561086,
+     "type_token_ratio": 0.0088238088631879,
      "coverage": {
-       "top_100": 0.43113921935660243,
-       "top_1000": 0.6237288116322734,
-       "top_5000": 0.7756172262594092,
-       "top_10000": 0.8329204449496337
+       "top_100": 0.4481618941219131,
+       "top_1000": 0.6351444736313012,
+       "top_5000": 0.7810939031610028,
+       "top_10000": 0.8379540286860089
      },
-     "hapax_count": 3103831,
-     "hapax_ratio": 0.6414158123806192,
-     "total_documents": 787529
+     "hapax_count": 1809890,
+     "hapax_ratio": 0.5483730811835398,
+     "total_documents": 782637
    }
  }

models/vocabulary/ca_vocabulary_top.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc7c0f73dc675fec3ea36f344213a6fef18fac8c3db6daba687f6e5b5ce59583
+ size 15047767

models/vocabulary/ca_vocabulary_top_metadata.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "language": "ca",
+   "vocabulary_size": 1000000,
+   "variant": "top",
+   "statistics": {
+     "type_token_ratio": 0.0088238088631879,
+     "coverage": {
+       "top_100": 0.4481618941219131,
+       "top_1000": 0.6351444736313012,
+       "top_5000": 0.7810939031610028,
+       "top_10000": 0.8379540286860089
+     },
+     "hapax_count": 1809890,
+     "hapax_ratio": 0.5483730811835398,
+     "total_documents": 782637,
+     "top_vocab_size": 1000000,
+     "coverage_ratio": 0.9923944164431507,
+     "tokens_excluded": 490582
+   }
+ }
models/word_markov/ca_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:07bdda626dbfe973038eb25a9dc10c1512fdb8805dfb1c42748d0c9780c2d6bc
- size 361692175
+ oid sha256:96435b63dd567ae47be848bf59b2c094743cf70cb0fef7e47e6b333c23332b3a
+ size 388066360

models/word_markov/ca_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
    "context_size": 1,
    "variant": "word",
    "language": "ca",
-   "unique_contexts": 4839481,
-   "total_transitions": 490797423
+   "unique_contexts": 3298751,
+   "total_transitions": 373259010
  }