omarkamali committed on
Commit
f62c63e
·
verified ·
1 Parent(s): 61198cd

Upload all models and assets for cs (latest)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. README.md +341 -141
  3. models/embeddings/aligned/cs_128d.bin +3 -0
  4. models/embeddings/aligned/cs_128d.meta.json +1 -0
  5. models/embeddings/aligned/cs_128d.projection.npy +3 -0
  6. models/embeddings/aligned/cs_128d_metadata.json +8 -0
  7. models/embeddings/aligned/cs_32d.bin +3 -0
  8. models/embeddings/aligned/cs_32d.meta.json +1 -0
  9. models/embeddings/aligned/cs_32d.projection.npy +3 -0
  10. models/embeddings/aligned/cs_32d_metadata.json +8 -0
  11. models/embeddings/aligned/cs_64d.bin +3 -0
  12. models/embeddings/aligned/cs_64d.meta.json +1 -0
  13. models/embeddings/aligned/cs_64d.projection.npy +3 -0
  14. models/embeddings/aligned/cs_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/cs_128d.bin +2 -2
  16. models/embeddings/monolingual/cs_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/cs_32d.bin +2 -2
  18. models/embeddings/monolingual/cs_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/cs_64d.bin +2 -2
  20. models/embeddings/monolingual/cs_64d_metadata.json +5 -3
  21. models/subword_markov/cs_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/cs_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/cs_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/cs_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/cs_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/cs_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/cs_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/cs_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/cs_2gram_subword.parquet +2 -2
  30. models/subword_ngram/cs_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/cs_3gram_subword.parquet +2 -2
  32. models/subword_ngram/cs_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/cs_4gram_subword.parquet +2 -2
  34. models/subword_ngram/cs_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/cs_5gram_subword.parquet +3 -0
  36. models/subword_ngram/cs_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/cs_tokenizer_16k.model +2 -2
  38. models/tokenizer/cs_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/cs_tokenizer_32k.model +2 -2
  40. models/tokenizer/cs_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/cs_tokenizer_64k.model +2 -2
  42. models/tokenizer/cs_tokenizer_64k.vocab +0 -0
  43. models/tokenizer/cs_tokenizer_8k.model +2 -2
  44. models/tokenizer/cs_tokenizer_8k.vocab +0 -0
  45. models/vocabulary/cs_vocabulary.parquet +2 -2
  46. models/vocabulary/cs_vocabulary_metadata.json +10 -9
  47. models/vocabulary/cs_vocabulary_top.parquet +3 -0
  48. models/vocabulary/cs_vocabulary_top_metadata.json +20 -0
  49. models/word_markov/cs_markov_ctx1_word.parquet +2 -2
  50. models/word_markov/cs_markov_ctx1_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
42
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
 
 
 
 
 
 
 
 
 
 
13
  - monolingual
14
  - family-slavic_west
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,14 +33,14 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 4.130
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.7709
30
  - name: vocabulary_size
31
  type: vocab
32
- value: 1000000
33
- generated: 2025-12-29
34
  ---
35
 
36
  # Czech - Wikilangs Models
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
44
  ### Models & Assets
45
 
46
  - Tokenizers (8k, 16k, 32k, 64k)
47
- - N-gram models (2, 3, 4-gram)
48
- - Markov chains (context of 1, 2, 3 and 4)
49
  - Subword N-gram and Markov chains
50
- - Embeddings in various sizes and dimensions
51
  - Language Vocabulary
52
  - Language Statistics
 
53
  ![Performance Dashboard](visualizations/performance_dashboard.png)
54
 
55
  ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
59
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
60
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
61
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
62
- - [6. Summary & Recommendations](#6-summary--recommendations)
 
63
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
64
  - [Visualizations Index](#visualizations-index)
65
 
@@ -68,57 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
68
 
69
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
70
 
 
 
 
 
 
 
71
  ### Results
72
 
73
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
74
  |------------|-------------|---------------|----------|--------------|
75
- | **8k** | 3.197x | 3.17 | 0.1005% | 3,306,013 |
76
- | **16k** | 3.544x | 3.51 | 0.1114% | 2,981,907 |
77
- | **32k** | 3.862x | 3.82 | 0.1214% | 2,736,410 |
78
- | **64k** | 4.130x 🏆 | 4.09 | 0.1298% | 2,558,883 |
79
 
80
  ### Tokenization Examples
81
 
82
  Below are sample sentences tokenized with each vocabulary size:
83
 
84
- **Sample 1:** `TJ Sigma Olomouc je historický název těchto klubů:
85
- SK Sigma Olomouc – fotbalo...`
86
 
87
  | Vocab | Tokens | Count |
88
  |-------|--------|-------|
89
- | 8k | `▁tj ▁si g ma m žolomoucje ▁historickýnázev ... (+31 more)` | 41 |
90
- | 16k | `▁tj ▁sig mam žolomoucjehistorickýnázevtěchto ... (+25 more)` | 35 |
91
- | 32k | `▁tj ▁sigmam žolomoucjehistorickýnázevtěchto ▁klubů ... (+21 more)` | 31 |
92
- | 64k | `▁tj ▁sigmam žolomoucjehistorickýnázevtěchto ▁klubů ... (+20 more)` | 30 |
93
 
94
- **Sample 2:** `PŘESMĚRUJ Chrpa čekánek
95
-
96
- Kategorie:Přesměrování z vědeckého jména`
97
 
98
  | Vocab | Tokens | Count |
99
  |-------|--------|-------|
100
- | 8k | `▁přes měru j ▁ch r pa ▁če nek ▁kategorie ... (+9 more)` | 19 |
101
- | 16k | `▁přes měru j ▁ch r pa ▁če nek ▁kategorie ... (+8 more)` | 18 |
102
- | 32k | `▁přes měru j ▁chr pa ▁čeká nek ▁kategorie : přes ... (+5 more)` | 15 |
103
- | 64k | `▁přes měru j ▁chr pa ▁čeká nek ▁kategorie : přes ... (+5 more)` | 15 |
104
-
105
- **Sample 3:** `Pello jsou dvě obce se stejným názvem:
106
 
107
- Pello (Finsko) – obec ve Finsku
108
- Pello ...`
109
 
110
  | Vocab | Tokens | Count |
111
  |-------|--------|-------|
112
- | 8k | `▁pel lojsoudvě ▁obcese ▁stej nýmnázvem : ... (+23 more)` | 33 |
113
- | 16k | `▁pel lojsoudvě ▁obcese ▁stejnýmnázvem : ▁pel ... (+19 more)` | 29 |
114
- | 32k | `▁pel lojsoudvě ▁obcese ▁stejnýmnázvem : pel ... (+19 more)` | 29 |
115
- | 64k | `▁pel lojsoudvě ▁obcese ▁stejnýmnázvem : pel ... (+17 more)` | 27 |
116
 
117
 
118
  ### Key Findings
119
 
120
- - **Best Compression:** 64k achieves 4.130x compression
121
- - **Lowest UNK Rate:** 8k with 0.1005% unknown tokens
122
  - **Trade-off:** Larger vocabularies improve compression but increase model size
123
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
124
 
@@ -127,57 +139,111 @@ Kategorie:Přesměrování z vědeckého jména`
127
 
128
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
129
 
 
 
130
  ![N-gram Coverage](visualizations/ngram_coverage.png)
131
 
132
  ### Results
133
 
134
- | N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
135
- |--------|------------|---------|----------------|------------------|-------------------|
136
- | **2-gram** | 295,975 🏆 | 18.18 | 5,487,215 | 10.4% | 21.2% |
137
- | **2-gram** | 540 🏆 | 9.08 | 35,551 | 50.3% | 96.3% |
138
- | **3-gram** | 1,394,157 | 20.41 | 12,698,712 | 5.9% | 11.9% |
139
- | **3-gram** | 5,799 | 12.50 | 338,538 | 15.9% | 51.3% |
140
- | **4-gram** | 3,715,211 | 21.83 | 23,000,459 | 5.1% | 8.9% |
141
- | **4-gram** | 39,713 | 15.28 | 2,482,399 | 7.1% | 24.3% |
 
 
142
 
143
  ### Top 5 N-grams by Size
144
 
145
- **2-grams:**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  | Rank | N-gram | Count |
148
  |------|--------|-------|
149
- | 1 | `kategorie :` | 2,810,345 |
150
- | 2 | `| |` | 1,600,369 |
151
- | 3 | `v roce` | 1,324,349 |
152
- | 4 | `. v` | 1,082,474 |
153
- | 5 | `) ,` | 872,870 |
154
 
155
- **3-grams:**
156
 
157
  | Rank | N-gram | Count |
158
  |------|--------|-------|
159
- | 1 | `| align =` | 701,748 |
160
- | 2 | `| | align` | 686,964 |
161
- | 3 | `align = right` | 683,296 |
162
- | 4 | `= right |` | 683,269 |
163
- | 5 | `kategorie : narození` | 390,185 |
164
 
165
- **4-grams:**
166
 
167
  | Rank | N-gram | Count |
168
  |------|--------|-------|
169
- | 1 | `| | align =` | 686,964 |
170
- | 2 | `| align = right` | 683,285 |
171
- | 3 | `align = right |` | 683,268 |
172
- | 4 | `kategorie : narození v` | 246,211 |
173
- | 5 | `externí odkazy kategorie :` | 191,110 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
 
176
  ### Key Findings
177
 
178
- - **Best Perplexity:** 2-gram with 540
179
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
180
- - **Coverage:** Top-1000 patterns cover ~24% of corpus
181
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
182
 
183
  ---
@@ -185,55 +251,86 @@ Kategorie:Přesměrování z vědeckého jména`
185
 
186
  ![Markov Entropy](visualizations/markov_entropy.png)
187
 
 
 
188
  ![Markov Branching](visualizations/markov_branching.png)
189
 
190
  ### Results
191
 
192
- | Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
193
- |---------|-------------|------------|------------------|-----------------|----------------|
194
- | **1** | 0.7467 | 1.678 | 10.64 | 5,180,358 | 25.3% |
195
- | **1** | 1.5920 | 3.015 | 12.43 | 10,644 | 0.0% |
196
- | **2** | 0.4222 | 1.340 | 2.71 | 55,087,986 | 57.8% |
197
- | **2** | 0.7625 | 1.696 | 5.46 | 132,235 | 23.8% |
198
- | **3** | 0.1844 | 1.136 | 1.46 | 149,279,731 | 81.6% |
199
- | **3** | 0.9138 | 1.884 | 5.69 | 722,190 | 8.6% |
200
- | **4** | 0.0848 🏆 | 1.061 | 1.17 | 217,279,300 | 91.5% |
201
- | **4** | 0.8187 🏆 | 1.764 | 4.11 | 4,107,972 | 18.1% |
202
 
203
- ### Generated Text Samples
204
 
205
- Below are text samples generated from each Markov chain model:
206
 
207
  **Context Size 1:**
208
 
209
- 1. `. a jeho šířením civilizace původní zahrady . předpokládá se tak , který pracoval v nemocnici`
210
- 2. `, ale ona . curtiss c v modelové animace ) . sk . y mhz )`
211
- 3. `v maďarské metropole vojvodiny . pupeny se používalo pro osvětovou činností policie ontario : oceněn...`
212
 
213
  **Context Size 2:**
214
 
215
- 1. `kategorie : muži kategorie : francouzští fotbalisté kategorie : úmrtí v roce 1884 kategorie : čeští ...`
216
- 2. `| | align = right | 2 | 13 , resurrection ( 1993 - 2000 elke delugan`
217
- 3. `v roce 2016 uskutečněno referendum ve spojeném království . dne 22 . března 1993li pcheng ili pcheng`
218
 
219
  **Context Size 3:**
220
 
221
- 1. `| align = right | 7 , 74 | | align = right | 4 , 366 |`
222
- 2. `| | align = right | 3 , 506 | | mba | | 8 | | 14`
223
- 3. `align = right | 9 , 09 | | align = right | 0 , 072 | |`
224
 
225
  **Context Size 4:**
226
 
227
- 1. `| | align = right | 2 , 615 | | align = right | 2 , 697 |`
228
- 2. `| align = right | 2 , 610 | | mba | | 6 . května 2000 | |`
229
- 3. `align = right | 0 , 199 | | mba | | 29 . července 2000 | | socorro`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
 
232
  ### Key Findings
233
 
234
- - **Best Predictability:** Context-4 with 91.5% predictability
235
  - **Branching Factor:** Decreases with context size (more deterministic)
236
- - **Memory Trade-off:** Larger contexts require more storage (4,107,972 contexts)
237
  - **Recommendation:** Context-3 or Context-4 for text generation
238
 
239
  ---
@@ -249,64 +346,64 @@ Below are text samples generated from each Markov chain model:
249
 
250
  | Metric | Value |
251
  |--------|-------|
252
- | Vocabulary Size | 1,000,000 |
253
- | Total Tokens | 260,225,789 |
254
- | Mean Frequency | 260.23 |
255
- | Median Frequency | 14 |
256
- | Frequency Std Dev | 13208.48 |
257
 
258
  ### Most Common Words
259
 
260
  | Rank | Word | Frequency |
261
  |------|------|-----------|
262
- | 1 | v | 7,430,819 |
263
- | 2 | a | 6,664,349 |
264
- | 3 | na | 3,549,580 |
265
- | 4 | se | 3,405,018 |
266
- | 5 | kategorie | 2,831,795 |
267
- | 6 | je | 2,114,934 |
268
- | 7 | s | 1,854,182 |
269
- | 8 | z | 1,755,717 |
270
- | 9 | do | 1,449,955 |
271
- | 10 | roce | 1,387,784 |
272
 
273
  ### Least Common Words (from vocabulary)
274
 
275
  | Rank | Word | Frequency |
276
  |------|------|-----------|
277
- | 1 | čechovovu | 4 |
278
- | 2 | vychloubavý | 4 |
279
- | 3 | euroatlantický | 4 |
280
- | 4 | postkomunistickým | 4 |
281
- | 5 | nadnesli | 4 |
282
- | 6 | sulyoka | 4 |
283
- | 7 | srebrenickém | 4 |
284
- | 8 | odepřený | 4 |
285
- | 9 | kosovskoalbánských | 4 |
286
- | 10 | ipap | 4 |
287
 
288
  ### Zipf's Law Analysis
289
 
290
  | Metric | Value |
291
  |--------|-------|
292
- | Zipf Coefficient | 0.9367 |
293
- | R² (Goodness of Fit) | 0.997046 |
294
  | Adherence Quality | **excellent** |
295
 
296
  ### Coverage Analysis
297
 
298
  | Top N Words | Coverage |
299
  |-------------|----------|
300
- | Top 100 | 27.3% |
301
- | Top 1,000 | 47.0% |
302
- | Top 5,000 | 64.5% |
303
- | Top 10,000 | 72.1% |
304
 
305
  ### Key Findings
306
 
307
- - **Zipf Compliance:** R²=0.9970 indicates excellent adherence to Zipf's law
308
- - **High Frequency Dominance:** Top 100 words cover 27.3% of corpus
309
- - **Long Tail:** 990,000 words needed for remaining 27.9% coverage
310
 
311
  ---
312
  ## 5. Word Embeddings Evaluation
@@ -319,24 +416,124 @@ Below are text samples generated from each Markov chain model:
319
 
320
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
321
 
322
- ### Model Comparison
323
 
324
- | Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
325
- |-------|------------|-----------|----------|----------|----------|
326
- | **mono_32d** | 1,659,956 | 32 | 3.229 | 0.994 | 0.7709 🏆 |
327
- | **mono_64d** | 1,659,956 | 64 | 3.639 | 0.952 | 0.7533 |
328
- | **mono_128d** | 1,659,956 | 128 | 4.126 | 0.969 | 0.7045 |
329
- | **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 
 
 
 
 
 
 
 
 
 
 
330
 
331
  ### Key Findings
332
 
333
- - **Best Isotropy:** mono_32d with 0.7709 (more uniform distribution)
334
- - **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
335
- - **Vocabulary Coverage:** All models cover 1,659,956 words
336
- - **Recommendation:** 100d for balanced semantic capture and efficiency
337
 
338
  ---
339
- ## 6. Summary & Recommendations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
  ![Performance Dashboard](visualizations/performance_dashboard.png)
342
 
@@ -344,11 +541,12 @@ Below are text samples generated from each Markov chain model:
344
 
345
  | Component | Recommended | Rationale |
346
  |-----------|-------------|-----------|
347
- | Tokenizer | **32k BPE** | Best compression (4.13x) with low UNK rate |
348
- | N-gram | **5-gram** | Lowest perplexity (540) |
349
- | Markov | **Context-4** | Highest predictability (91.5%) |
350
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
351
 
 
352
  ---
353
  ## Appendix: Metrics Glossary & Interpretation Guide
354
 
@@ -538,7 +736,8 @@ If you use these models in your research, please cite:
538
  author = {Kamali, Omar},
539
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
540
  year = {2025},
541
- publisher = {HuggingFace},
 
542
  url = {https://huggingface.co/wikilangs},
543
  institution = {Omneity Labs}
544
  }
@@ -554,7 +753,8 @@ MIT License - Free for academic and commercial use.
554
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
555
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
556
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 
557
  ---
558
  *Generated by Wikilangs Models Pipeline*
559
 
560
- *Report Date: 2025-12-29 05:36:16*
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-slavic_west
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 4.591
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.7988
40
  - name: vocabulary_size
41
  type: vocab
42
+ value: 1830714
43
+ generated: 2026-01-08
44
  ---
45
 
46
  # Czech - Wikilangs Models
 
54
  ### Models & Assets
55
 
56
  - Tokenizers (8k, 16k, 32k, 64k)
57
+ - N-gram models (2, 3, 4, 5-gram)
58
+ - Markov chains (context of 1, 2, 3, 4 and 5)
59
  - Subword N-gram and Markov chains
60
+ - Embeddings in various sizes and dimensions (aligned and unaligned)
61
  - Language Vocabulary
62
  - Language Statistics
63
+
64
  ![Performance Dashboard](visualizations/performance_dashboard.png)
65
 
66
  ### Analysis and Evaluation
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6-morphological-analysis-experimental)
74
+ - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
77
 
 
80
 
81
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
82
 
83
+ ![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
84
+
85
+ ![Tokenizer OOV](visualizations/tokenizer_oov.png)
86
+
87
+ ![Total Tokens](visualizations/tokenizer_total_tokens.png)
88
+
89
  ### Results
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 3.417x | 3.42 | 0.0769% | 2,893,388 |
94
+ | **16k** | 3.845x | 3.85 | 0.0865% | 2,570,989 |
95
+ | **32k** | 4.245x | 4.25 | 0.0955% | 2,328,840 |
96
+ | **64k** | 4.591x 🏆 | 4.59 | 0.1033% | 2,153,192 |
97
 
98
  ### Tokenization Examples
99
 
100
  Below are sample sentences tokenized with each vocabulary size:
101
 
102
+ **Sample 1:** `<tr> Související články Seznam kulturních památek v okrese Znojmo Externí odkazy...`
 
103
 
104
  | Vocab | Tokens | Count |
105
  |-------|--------|-------|
106
+ | 8k | `▁< tr >související ▁článkyseznamkultur níchpam átek ... (+17 more)` | 27 |
107
+ | 16k | `▁< tr >související ▁článkyseznamkulturníchpamátekvokrese ... (+13 more)` | 23 |
108
+ | 32k | `▁< tr > související ▁článkyseznamkulturníchpamátekvokrese ... (+11 more)` | 21 |
109
+ | 64k | `▁< tr > související ▁článkyseznamkulturníchpamátekvokrese ... (+11 more)` | 21 |
110
 
111
+ **Sample 2:** `Mirovice <tr> Sochovice <tr> Související články Seznam kulturních památek v okre...`
 
 
112
 
113
  | Vocab | Tokens | Count |
114
  |-------|--------|-------|
115
+ | 8k | `▁mi rovice ▁< tr > ▁so ch ovice ▁< tr ... (+17 more)` | 27 |
116
+ | 16k | `▁mi rovice ▁< tr > ▁so chovice ▁< tr > ... (+14 more)` | 24 |
117
+ | 32k | `▁mi rovice ▁< tr > ▁so chovice ▁< tr > ... (+14 more)` | 24 |
118
+ | 64k | `▁mi rovice ▁< tr > ▁so chovice ▁< tr > ... (+14 more)` | 24 |
 
 
119
 
120
+ **Sample 3:** `Sabra může být: sabra – hebrejské slovo Sabra (tank) Sabra sídlo v Libanonu, d...`
 
121
 
122
  | Vocab | Tokens | Count |
123
  |-------|--------|-------|
124
+ | 8k | `▁sa bramůžebýt :sa bra ▁–hebrej ské ... (+22 more)` | 32 |
125
+ | 16k | `▁sa bramůžebýt :sa bra ▁– hebrej ské ... (+21 more)` | 31 |
126
+ | 32k | `▁sa bramůžebýt :sa bra ▁– hebrejskéslovo ... (+17 more)` | 27 |
127
+ | 64k | `▁sa bramůžebýt :sa bra ▁– hebrejskéslovo ... (+15 more)` | 25 |
128
 
129
 
130
  ### Key Findings
131
 
132
+ - **Best Compression:** 64k achieves 4.591x compression
133
+ - **Lowest UNK Rate:** 8k with 0.0769% unknown tokens
134
  - **Trade-off:** Larger vocabularies improve compression but increase model size
135
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
136
 
 
139
 
140
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
141
 
142
+ ![N-gram Unique](visualizations/ngram_unique.png)
143
+
144
  ![N-gram Coverage](visualizations/ngram_coverage.png)
145
 
146
  ### Results
147
 
148
+ | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
149
+ |--------|---------|------------|---------|----------------|------------------|-------------------|
150
+ | **2-gram** | Word | 644,039 | 19.30 | 4,952,358 | 4.8% | 11.9% |
151
+ | **2-gram** | Subword | 449 🏆 | 8.81 | 30,223 | 53.9% | 98.0% |
152
+ | **3-gram** | Word | 2,339,059 | 21.16 | 8,925,525 | 2.6% | 6.4% |
153
+ | **3-gram** | Subword | 4,755 | 12.22 | 255,109 | 16.7% | 54.3% |
154
+ | **4-gram** | Word | 5,475,376 | 22.38 | 14,408,434 | 1.3% | 3.9% |
155
+ | **4-gram** | Subword | 32,796 | 15.00 | 1,646,964 | 6.8% | 24.8% |
156
+ | **5-gram** | Word | 4,645,198 | 22.15 | 10,221,820 | 1.0% | 3.6% |
157
+ | **5-gram** | Subword | 160,592 | 17.29 | 6,437,902 | 3.7% | 13.8% |
158
 
159
  ### Top 5 N-grams by Size
160
 
161
+ **2-grams (Word):**
162
+
163
+ | Rank | N-gram | Count |
164
+ |------|--------|-------|
165
+ | 1 | `v roce` | 1,319,715 |
166
+ | 2 | `externí odkazy` | 445,741 |
167
+ | 3 | `odkazy reference` | 238,320 |
168
+ | 4 | `reference externí` | 226,335 |
169
+ | 5 | `v letech` | 212,278 |
170
+
171
+ **3-grams (Word):**
172
+
173
+ | Rank | N-gram | Count |
174
+ |------|--------|-------|
175
+ | 1 | `reference externí odkazy` | 226,294 |
176
+ | 2 | `odkazy reference externí` | 124,877 |
177
+ | 3 | `v roce v` | 123,855 |
178
+ | 4 | `v roce se` | 91,582 |
179
+ | 5 | `v roce byl` | 64,824 |
180
+
181
+ **4-grams (Word):**
182
 
183
  | Rank | N-gram | Count |
184
  |------|--------|-------|
185
+ | 1 | `odkazy reference externí odkazy` | 124,850 |
186
+ | 2 | `odkazy reference související články` | 42,127 |
187
+ | 3 | `v roce v roce` | 34,075 |
188
+ | 4 | `reference externí odkazy v` | 29,798 |
189
+ | 5 | `externí odkazy oficiální stránky` | 20,103 |
190
 
191
+ **5-grams (Word):**
192
 
193
  | Rank | N-gram | Count |
194
  |------|--------|-------|
195
+ | 1 | `odkazy reference externí odkazy v` | 16,236 |
196
+ | 2 | `odkazy reference literatura externí odkazy` | 12,685 |
197
+ | 3 | `reference externí odkazy oficiální stránky` | 11,834 |
198
+ | 4 | `historie první písemná zmínka o` | 11,754 |
199
+ | 5 | `reference externí odkazy v okrese` | 11,425 |
200
 
201
+ **2-grams (Subword):**
202
 
203
  | Rank | N-gram | Count |
204
  |------|--------|-------|
205
+ | 1 | `a _` | 24,781,439 |
206
+ | 2 | `_ p` | 22,589,509 |
207
+ | 3 | `e _` | 22,268,109 |
208
+ | 4 | `_ s` | 22,095,879 |
209
+ | 5 | `_ v` | 19,926,387 |
210
+
211
+ **3-grams (Subword):**
212
+
213
+ | Rank | N-gram | Count |
214
+ |------|--------|-------|
215
+ | 1 | `n í _` | 7,673,842 |
216
+ | 2 | `_ p o` | 7,582,650 |
217
+ | 3 | `_ v _` | 7,272,309 |
218
+ | 4 | `n a _` | 6,690,107 |
219
+ | 5 | `_ a _` | 6,501,417 |
220
+
221
+ **4-grams (Subword):**
222
+
223
+ | Rank | N-gram | Count |
224
+ |------|--------|-------|
225
+ | 1 | `_ n a _` | 3,511,209 |
226
+ | 2 | `_ s e _` | 3,364,693 |
227
+ | 3 | `_ p r o` | 3,186,267 |
228
+ | 4 | `_ b y l` | 2,542,448 |
229
+ | 5 | `ý c h _` | 2,252,305 |
230
+
231
+ **5-grams (Subword):**
232
+
233
+ | Rank | N-gram | Count |
234
+ |------|--------|-------|
235
+ | 1 | `_ k t e r` | 1,412,346 |
236
+ | 2 | `_ r o c e` | 1,383,042 |
237
+ | 3 | `_ v _ r o` | 1,382,611 |
238
+ | 4 | `r o c e _` | 1,354,432 |
239
+ | 5 | `v _ r o c` | 1,321,210 |
240
 
241
 
242
  ### Key Findings
243
 
244
+ - **Best Perplexity:** 2-gram (subword) with 449
245
  - **Entropy Trend:** Increases with larger n-grams (the space of distinct n-grams grows, so the distribution is less concentrated)
246
+ - **Coverage:** Top-1000 patterns cover ~14% of corpus (5-gram subword model)
247
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
248
 
249
  ---
 
251
 
252
  ![Markov Entropy](visualizations/markov_entropy.png)
253
 
254
+ ![Markov Contexts](visualizations/markov_contexts.png)
255
+
256
  ![Markov Branching](visualizations/markov_branching.png)
257
 
258
  ### Results
259
 
260
+ | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
261
+ |---------|---------|-------------|------------|------------------|-----------------|----------------|
262
+ | **1** | Word | 1.0698 | 2.099 | 16.20 | 3,817,910 | 0.0% |
263
+ | **1** | Subword | 1.2123 | 2.317 | 8.62 | 14,369 | 0.0% |
264
+ | **2** | Word | 0.3832 | 1.304 | 2.35 | 61,779,051 | 61.7% |
265
+ | **2** | Subword | 0.6716 | 1.593 | 4.71 | 123,767 | 32.8% |
266
+ | **3** | Word | 0.1433 | 1.104 | 1.31 | 144,949,424 | 85.7% |
267
+ | **3** | Subword | 0.7660 | 1.701 | 4.77 | 583,275 | 23.4% |
268
+ | **4** | Word | 0.0564 🏆 | 1.040 | 1.10 | 189,649,924 | 94.4% |
269
+ | **4** | Subword | 0.7409 | 1.671 | 4.00 | 2,782,368 | 25.9% |
270
 
271
+ ### Generated Text Samples (Word-based)
272
 
273
+ Below are text samples generated from each word-based Markov chain model:
274
 
275
  **Context Size 1:**
276
 
277
+ 1. `v podobě vystavěn byl opětovně pohřbena ve dveřích některých případech může vytvořit jediné dopravní...`
278
+ 2. `a příslušník staré město zbiroh živa je americký teoretický kvantový stav potrvá v létě odešel na`
279
+ 3. `na fakt že neměl v červenci i z původních 113 120 metrů vysokém tlaku na východě`
280
 
281
  **Context Size 2:**
282
 
283
+ 1. `v roce lidé 6 prosince praha byl michal kraus čssd čssd 48 rychnov nad kněžnou kaple stojí`
284
+ 2. `externí odkazy jihovýchodní evropy jihozápadní asie kavkazu číny sibiře východní asie hustě chlupatá...`
285
+ 3. `odkazy reference externí odkazy sdružení na praze 4 rozhovor vznikl v roce kde bojoval proti ostrogó...`
286
 
287
  **Context Size 3:**
288
 
289
+ 1. `reference externí odkazy v ternopilské oblasti na řece strypa v historickém regionu horní lužice mim...`
290
+ 2. `odkazy reference externí odkazy speleologická společnost vševěd romantismu hudební skladatelé klavír...`
291
+ 3. `v roce v angličtině se pro celou skupinu alfred crompton catherine musinsky jose bonaparte bhart anj...`
292
 
293
  **Context Size 4:**
294
 
295
+ 1. `odkazy reference externí odkazy strategie série`
296
+ 2. `odkazy reference související články fotografie v norsku externí odkazy na seznamu světového dědictví...`
297
+ 3. `v roce v roce v praze pilotní školu druhá světová válka po roce vojenské služby v polské armádě prot...`
298
+
299
+
300
+ ### Generated Text Samples (Subword-based)
301
+
302
+ Below are text samples generated from each subword-based Markov chain model:
303
+
304
+ **Context Size 1:**
305
+
306
+ 1. `_hraloponodovo._`
307
+ 2. `os_zu_va_vu_dulo`
308
+ 3. `ekodici_micl_v_s`
309
+
310
+ **Context Size 2:**
311
+
312
+ 1. `a_stříjna_se_rozh`
313
+ 2. `_příčku_uraven_pe`
314
+ 3. `e_na_vítlická_hov`
315
+
316
+ **Context Size 3:**
317
+
318
+ 1. `ní_nejčastoru_o_sp`
319
+ 2. `_polik_v_com_trans`
320
+ 3. `_v_195_zúčasná_náz`
321
+
322
+ **Context Size 4:**
323
+
324
+ 1. `_na_v_nicméně_chlaz`
325
+ 2. `_se_proje_asistenci`
326
+ 3. `_pro_pozdně,_lze_sa`
327
 
328
 
329
  ### Key Findings
330
 
331
+ - **Best Predictability:** Context-4 (word) with 94.4% predictability
332
  - **Branching Factor:** Decreases with context size (more deterministic)
333
+ - **Memory Trade-off:** Larger contexts require more storage (2,782,368 contexts)
334
  - **Recommendation:** Context-3 or Context-4 for text generation
335
 
336
  ---
 
346
 
347
  | Metric | Value |
348
  |--------|-------|
349
+ | Vocabulary Size | 1,830,714 |
350
+ | Total Tokens | 237,612,209 |
351
+ | Mean Frequency | 129.79 |
352
+ | Median Frequency | 5 |
353
+ | Frequency Std Dev | 9362.17 |
354
 
355
  ### Most Common Words
356
 
357
  | Rank | Word | Frequency |
358
  |------|------|-----------|
359
+ | 1 | v | 7,396,110 |
360
+ | 2 | a | 6,633,731 |
361
+ | 3 | na | 3,536,561 |
362
+ | 4 | se | 3,396,490 |
363
+ | 5 | je | 2,110,163 |
364
+ | 6 | s | 1,781,636 |
365
+ | 7 | z | 1,747,028 |
366
+ | 8 | do | 1,440,810 |
367
+ | 9 | roce | 1,383,007 |
368
+ | 10 | ve | 1,284,897 |
369
 
370
  ### Least Common Words (from vocabulary)
371
 
372
  | Rank | Word | Frequency |
373
  |------|------|-----------|
374
+ | 1 | mihty | 2 |
375
+ | 2 | socionaut | 2 |
376
+ | 3 | mafjar | 2 |
377
+ | 4 | vlta | 2 |
378
+ | 5 | havlátková | 2 |
379
+ | 6 | makbúsu | 2 |
380
+ | 7 | propfanů | 2 |
381
+ | 8 | propfanu | 2 |
382
+ | 9 | ochmeloff | 2 |
383
+ | 10 | luncași | 2 |
384
 
385
  ### Zipf's Law Analysis
386
 
387
  | Metric | Value |
388
  |--------|-------|
389
+ | Zipf Coefficient | 0.9138 |
390
+ | R² (Goodness of Fit) | 0.997539 |
391
  | Adherence Quality | **excellent** |
392
 
393
  ### Coverage Analysis
394
 
395
  | Top N Words | Coverage |
396
  |-------------|----------|
397
+ | Top 100 | 27.1% |
398
+ | Top 1,000 | 45.7% |
399
+ | Top 5,000 | 63.0% |
400
+ | Top 10,000 | 70.6% |
401
 
402
  ### Key Findings
403
 
404
+ - **Zipf Compliance:** R²=0.9975 indicates excellent adherence to Zipf's law
405
+ - **High Frequency Dominance:** Top 100 words cover 27.1% of corpus
406
+ - **Long Tail:** 1,820,714 words needed for remaining 29.4% coverage
407
 
408
  ---
409
  ## 5. Word Embeddings Evaluation
 
416
 
417
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
418
 
 
419
 
420
+ ### 5.1 Cross-Lingual Alignment
421
+
422
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
423
+
424
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
425
+
426
+
427
+ ### 5.2 Model Comparison
428
+
429
+ | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
430
+ |-------|-----------|----------|------------------|---------------|----------------|
431
+ | **mono_32d** | 32 | 0.7988 | 0.3622 | N/A | N/A |
432
+ | **mono_64d** | 64 | 0.7835 | 0.2893 | N/A | N/A |
433
+ | **mono_128d** | 128 | 0.7363 | 0.2299 | N/A | N/A |
434
+ | **aligned_32d** | 32 | 0.7988 🏆 | 0.3646 | 0.3500 | 0.7360 |
435
+ | **aligned_64d** | 64 | 0.7835 | 0.2898 | 0.5900 | 0.8980 |
436
+ | **aligned_128d** | 128 | 0.7363 | 0.2271 | 0.7320 | 0.9520 |
437
 
438
  ### Key Findings
439
 
440
+ - **Best Isotropy:** aligned_32d with 0.7988 (more uniform distribution)
441
+ - **Semantic Density:** Average pairwise similarity of 0.2938. Lower values indicate better semantic separation.
442
+ - **Alignment Quality:** Aligned models achieve up to 73.2% R@1 in cross-lingual retrieval.
443
+ - **Recommendation:** 128d aligned for best cross-lingual performance
444
 
445
  ---
446
+ ## 6. Morphological Analysis (Experimental)
447
+
448
+ This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
449
+
450
+ ### 6.1 Productivity & Complexity
451
+
452
+ | Metric | Value | Interpretation | Recommendation |
453
+ |--------|-------|----------------|----------------|
454
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
455
+ | Idiomaticity Gap | **-0.741** | Low formulaic content | - |
456
+
457
+ ### 6.2 Affix Inventory (Productive Units)
458
+
459
+ These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
460
+
461
+ #### Productive Prefixes
462
+ | Prefix | Examples |
463
+ |--------|----------|
464
+ | `ne-` | nezamítl, neomorf, nenapájeným |
465
+ | `po-` | poštulky, ponoršťování, powerkiting |
466
+
467
+ #### Productive Suffixes
468
+ | Suffix | Examples |
469
+ |--------|----------|
470
+ | `-em` | charmsem, treitschkem, holtem |
471
+ | `-ch` | orbitalech, lekebusch, sklízených |
472
+ | `-ho` | vladivostockého, sertoliho, cenokarpního |
473
+ | `-ou` | hobgarskou, výfukovou, robotou |
474
+
475
+ ### 6.3 Bound Stems (Lexical Roots)
476
+
477
+ Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
478
+
479
+ | Stem | Cohesion | Substitutability | Examples |
480
+ |------|----------|------------------|----------|
481
+ | `ovýc` | 2.16x | 487 contexts | ových, xových, nových |
482
+ | `skéh` | 2.15x | 392 contexts | ského, lského, urského |
483
+ | `skýc` | 1.97x | 237 contexts | ských, skýcov, tských |
484
+ | `ický` | 1.57x | 496 contexts | tický, bický, úpický |
485
+ | `nské` | 1.53x | 491 contexts | anské, inské, ínské |
486
+ | `ován` | 1.44x | 594 contexts | ování, kován, zování |
487
+ | `ické` | 1.46x | 499 contexts | tické, lické, mické |
488
+ | `ledn` | 1.59x | 250 contexts | lednu, ledna, ledný |
489
+ | `itel` | 1.36x | 634 contexts | nitel, litel, pitel |
490
+ | `cház` | 1.52x | 287 contexts | chází, schází, ochází |
491
+ | `dkaz` | 2.66x | 23 contexts | odkaz, odkaze, odkazy |
492
+ | `xter` | 1.81x | 76 contexts | exter, xterm, extern |
493
+
494
+ ### 6.4 Affix Compatibility (Co-occurrence)
495
+
496
+ This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
497
+
498
+ | Prefix | Suffix | Frequency | Examples |
499
+ |--------|--------|-----------|----------|
500
+ | `ne-` | `-ch` | 14 words | nepropouštějících, netermínovaných |
501
+ | `ne-` | `-ho` | 10 words | nejpokročilejšího, nezpochybnitelného |
502
+ | `ne-` | `-ou` | 9 words | nestejnou, nerozšiřitelnou |
503
+ | `po-` | `-ho` | 9 words | podmínkového, polštářovitého |
504
+ | `po-` | `-ch` | 7 words | pohodlnějších, polohovkách |
505
+ | `po-` | `-ou` | 6 words | ponitranskou, pomátnou |
506
+ | `po-` | `-em` | 3 words | pollackem, povříslem |
507
+
508
+ ### 6.5 Recursive Morpheme Segmentation
509
+
510
+ Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
511
+
512
+ | Word | Suggested Split | Confidence | Stem |
513
+ |------|-----------------|------------|------|
514
+ | nedoloženou | **`ne-doložen-ou`** | 6.0 | `doložen` |
515
+ | nepochybovala | **`ne-po-chybovala`** | 6.0 | `chybovala` |
516
+ | nepostaral | **`ne-po-staral`** | 6.0 | `staral` |
517
+ | nacionálem | **`nacionál-em`** | 4.5 | `nacionál` |
518
+ | chimentiho | **`chimenti-ho`** | 4.5 | `chimenti` |
519
+ | prostonárodního | **`prostonárodní-ho`** | 4.5 | `prostonárodní` |
520
+ | klokotských | **`klokotský-ch`** | 4.5 | `klokotský` |
521
+ | bibliografického | **`bibliografické-ho`** | 4.5 | `bibliografické` |
522
+ | nesvědčily | **`ne-svědčily`** | 4.5 | `svědčily` |
523
+ | nenavázali | **`ne-navázali`** | 4.5 | `navázali` |
524
+ | ibragimovem | **`ibragimov-em`** | 4.5 | `ibragimov` |
525
+ | zeměplošských | **`zeměplošský-ch`** | 4.5 | `zeměplošský` |
526
+ | hliníkových | **`hliníkový-ch`** | 4.5 | `hliníkový` |
527
+ | etylenglykolem | **`etylenglykol-em`** | 4.5 | `etylenglykol` |
528
+ | mnohosamicového | **`mnohosamicové-ho`** | 4.5 | `mnohosamicové` |
529
+
530
+ ### 6.6 Linguistic Interpretation
531
+
532
+ > **Automated Insight:**
533
+ Czech shows high morphological productivity. The subword models are significantly more efficient than the word models, suggesting a rich system of affixation or compounding.
534
+
535
+ ---
536
+ ## 7. Summary & Recommendations
537
 
538
  ![Performance Dashboard](visualizations/performance_dashboard.png)
539
 
 
541
 
542
  | Component | Recommended | Rationale |
543
  |-----------|-------------|-----------|
544
+ | Tokenizer | **64k BPE** | Best compression (4.59x) |
545
+ | N-gram | **2-gram** | Lowest perplexity (449) |
546
+ | Markov | **Context-4** | Highest predictability (94.4%) |
547
  | Embeddings | **128d (aligned)** | Best cross-lingual alignment (73.2% R@1) |
548
 
549
+
550
  ---
551
  ## Appendix: Metrics Glossary & Interpretation Guide
552
 
 
736
  author = {Kamali, Omar},
737
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
738
  year = {2025},
739
+ doi = {10.5281/zenodo.18073153},
740
+ publisher = {Zenodo},
741
  url = {https://huggingface.co/wikilangs},
742
  institution = {Omneity Labs}
743
  }
 
753
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
754
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
755
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
756
+ - 🤝 Sponsor: [Featherless AI](https://featherless.ai)
757
  ---
758
  *Generated by Wikilangs Models Pipeline*
759
 
760
+ *Report Date: 2026-01-08 17:02:58*
models/embeddings/aligned/cs_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee73a43ec364b9064a5ce15c015db41e81048adbd31d37088f27c0d6393be945
3
+ size 2592173419
models/embeddings/aligned/cs_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "cs", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/cs_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fd4a4b58d02359e21ce26e20e4016044953c9fff3f7dc1b954aff16af53fbf2
3
+ size 65664
models/embeddings/aligned/cs_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "cs",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 248433,
7
+ "vocab_size": 1503404
8
+ }
models/embeddings/aligned/cs_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d1c30712ea75937478c58b7fb8c1c3caf83309663a594ca004504a00193bb46
3
+ size 669559147
models/embeddings/aligned/cs_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "cs", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/cs_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efd67a9b2efccb4b9ebbd0d7fdbe5b7b688d27853ec7334edef1eb0fa5a25a6f
3
+ size 4224
models/embeddings/aligned/cs_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "cs",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 248433,
7
+ "vocab_size": 1503404
8
+ }
models/embeddings/aligned/cs_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d7e7c706963168c9f66fbb72372f5f81606711130ef7466f09e14ed88e7a4d0
3
+ size 1310430571
models/embeddings/aligned/cs_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "cs", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/cs_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcb8041a156dac565236b7bc938205bae3bdf77a3b16b4d40c14ffeb65c7cadb
3
+ size 16512
models/embeddings/aligned/cs_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "cs",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 248433,
7
+ "vocab_size": 1503404
8
+ }
models/embeddings/monolingual/cs_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:034be2a68b65bcb979f8fd0ad025b0db9cab145669e15df9a7f1486b7f1a70d7
3
- size 2755994697
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee73a43ec364b9064a5ce15c015db41e81048adbd31d37088f27c0d6393be945
3
+ size 2592173419
models/embeddings/monolingual/cs_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 128,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1659956
13
  }
 
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 128
13
  },
14
+ "vocab_size": 1503404
15
  }
models/embeddings/monolingual/cs_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4319e47ddfbdaf7e51194eb7d0f217f4c9be97174c76fb134b4ee3ffeb2e0897
3
- size 713148489
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d1c30712ea75937478c58b7fb8c1c3caf83309663a594ca004504a00193bb46
3
+ size 669559147
models/embeddings/monolingual/cs_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 32,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1659956
13
  }
 
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 32
13
  },
14
+ "vocab_size": 1503404
15
  }
models/embeddings/monolingual/cs_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8217b6554747d00396fbef9bba76405a29ea8883630e21ebcb1e06530642d7d1
3
- size 1394097225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d7e7c706963168c9f66fbb72372f5f81606711130ef7466f09e14ed88e7a4d0
3
+ size 1310430571
models/embeddings/monolingual/cs_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 64,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1659956
13
  }
 
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 64
13
  },
14
+ "vocab_size": 1503404
15
  }
models/subword_markov/cs_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1189c98a361867f4246e4275f1b3cc259688644d0e3fd5ef3e0f131b7fa96c1d
3
- size 812432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f728385f32db4d56eb98f894af4f69b3ed5eff19c0b9d1c11e941cba1f3f8c54
3
+ size 839807
models/subword_markov/cs_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "cs",
5
- "unique_contexts": 10644,
6
- "total_transitions": 1795628327
7
  }
 
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "cs",
5
+ "unique_contexts": 14369,
6
+ "total_transitions": 1591006734
7
  }
models/subword_markov/cs_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d80c00661e7ca277f295ef570e079bc0f40addf0e5da5aee5a2c8484c8ff03b1
3
- size 5964730
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c686d9a25d6802505309d41e46898ae44a9610103bbf6b43b177215526f053b
3
+ size 4923338
models/subword_markov/cs_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "cs",
5
- "unique_contexts": 132235,
6
- "total_transitions": 1795047002
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "cs",
5
+ "unique_contexts": 123767,
6
+ "total_transitions": 1590425862
7
  }
models/subword_markov/cs_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bb7c3a0114ea8d93ca96ea2f76cc79ae31651b3708426fdbdca9ef5019f3703
3
- size 30364313
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2efc4336d93106340240bfa8c0fcdb41850c4e711995576d18cac4a39b2921fc
3
+ size 21935499
models/subword_markov/cs_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "cs",
5
- "unique_contexts": 722190,
6
- "total_transitions": 1794465677
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "cs",
5
+ "unique_contexts": 583275,
6
+ "total_transitions": 1589844990
7
  }
models/subword_markov/cs_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54f5b723aa026548afb4a796c1f266538d54dad4a3b88971d09bae038aa52c12
3
- size 130573936
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16d70dfb8facb64efbcd9b152d807fa295c04a1b0c433d6daa1b99f45fcf62c6
3
+ size 89927521
models/subword_markov/cs_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "cs",
5
- "unique_contexts": 4107972,
6
- "total_transitions": 1793884352
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "cs",
5
+ "unique_contexts": 2782368,
6
+ "total_transitions": 1589264118
7
  }
models/subword_ngram/cs_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:699c97e596d231e7e2c92b0a1a01c3fee395be13f0eaebd6cc7a5de244db366b
3
- size 505831
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3bca592f8a143683b686f0e37a72e554f552c21ef033b2ff317d7964000b9bc
3
+ size 433014
models/subword_ngram/cs_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "cs",
5
- "unique_ngrams": 35551,
6
- "total_ngrams": 1795628327
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "cs",
5
+ "unique_ngrams": 30223,
6
+ "total_ngrams": 1591006734
7
  }
models/subword_ngram/cs_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d7a587c6dec2716a1308f0986b57b783fb4e190cfa5d13628ba80dfd2ade9cf
3
- size 4237349
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39fc7ee4da840ca0792f53822766d4e880f321021b7b2192a655d5d175332d23
3
+ size 3254803
models/subword_ngram/cs_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "cs",
5
- "unique_ngrams": 338538,
6
- "total_ngrams": 1795047002
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "cs",
5
+ "unique_ngrams": 255109,
6
+ "total_ngrams": 1590425862
7
  }
models/subword_ngram/cs_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4c9df563dea7bf57f71855ad9bbb6313209791d033cad384a57f751da02a0f2
3
- size 29932999
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b789c05230d10bdbd3be1e9661bec890f8aa337f333ec24cdfd811b63240337a
3
+ size 20392535
models/subword_ngram/cs_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "cs",
5
- "unique_ngrams": 2482399,
6
- "total_ngrams": 1794465677
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "cs",
5
+ "unique_ngrams": 1646964,
6
+ "total_ngrams": 1589844990
7
  }
models/subword_ngram/cs_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c457c38c3eb730bc5b170bc78b1ffb26e1125256cf7ff9765eaec66b334f858a
3
+ size 80495807
models/subword_ngram/cs_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "cs",
5
+ "unique_ngrams": 6437902,
6
+ "total_ngrams": 1589264118
7
+ }
models/tokenizer/cs_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6f21f9f740610ed50f3312df4811940dcbc3c9f89f0b0898ad2cb28d0733451
3
- size 514778
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a79a59e7e6f05b35164ccd20e22fd357803c388d68fffcb0739fea4e66115909
3
+ size 514491
models/tokenizer/cs_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/cs_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:073977c2b03615dfcc7f5b7f9a464f331d5c93321d46434604c98a0295b16fa8
3
- size 804922
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92d519e9300c95d83a96fa4f4de0a13584fb4b643d9dd8c8d09f4fad15dff835
3
+ size 803835
models/tokenizer/cs_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/cs_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8da8677349daac2fa7f0977407831d7669c7cde4e4213febebdb1aeec967857f
3
- size 1402061
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e547414ac25c5799f3cbcf8c9d78e1473f21398b1b2e6e9e6d4668900be78de
3
+ size 1397324
models/tokenizer/cs_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/cs_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f1bbe8a7e6a224eed7d53e339bcf4df48d3fd66f77f1bf2609e42d9ab6b8f5b
3
- size 374540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2101aaa123afbd98d94b7170eb4cf00b0c5ef0dfc983c7359e00730b098aed0
3
+ size 374590
models/tokenizer/cs_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/cs_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8a85a74123473b20ac5f44858db1993e00f646437af9ce9d0e0b0194c590114
3
- size 15653412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f4ac7c5f2fe22fcc2625fb967be7f5b9111f19cfce6c05ef270a509337034c8
3
+ size 28462430
models/vocabulary/cs_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "language": "cs",
3
- "vocabulary_size": 1000000,
 
4
  "statistics": {
5
- "type_token_ratio": 0.019459510539549023,
6
  "coverage": {
7
- "top_100": 0.2667156178234961,
8
- "top_1000": 0.459097000188151,
9
- "top_5000": 0.6309909217632615,
10
- "top_10000": 0.7050773559377174
11
  },
12
- "hapax_count": 3049908,
13
- "hapax_ratio": 0.5888045382175596,
14
- "total_documents": 581325
15
  }
16
  }
 
1
  {
2
  "language": "cs",
3
+ "vocabulary_size": 1830714,
4
+ "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.01593860093466459,
7
  "coverage": {
8
+ "top_100": 0.26912381903885885,
9
+ "top_1000": 0.4535615447036626,
10
+ "top_5000": 0.6242837292543639,
11
+ "top_10000": 0.7003857714922751
12
  },
13
+ "hapax_count": 1988181,
14
+ "hapax_ratio": 0.520616827642551,
15
+ "total_documents": 580872
16
  }
17
  }
models/vocabulary/cs_vocabulary_top.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b61261bafc47e9fd112290a889706214bc8ebfd90036ac510310fd8a7f0f99bf
3
+ size 15708837
models/vocabulary/cs_vocabulary_top_metadata.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "cs",
3
+ "vocabulary_size": 1000000,
4
+ "variant": "top",
5
+ "statistics": {
6
+ "type_token_ratio": 0.01593860093466459,
7
+ "coverage": {
8
+ "top_100": 0.26912381903885885,
9
+ "top_1000": 0.4535615447036626,
10
+ "top_5000": 0.6242837292543639,
11
+ "top_10000": 0.7003857714922751
12
+ },
13
+ "hapax_count": 1988181,
14
+ "hapax_ratio": 0.520616827642551,
15
+ "total_documents": 580872,
16
+ "top_vocab_size": 1000000,
17
+ "coverage_ratio": 0.983133787887407,
18
+ "tokens_excluded": 830714
19
+ }
20
+ }
models/word_markov/cs_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8eae25b71953a72a3b4ad0cdb7cc9159ab32d4feb13cd01007eba2a1452ef77d
3
- size 529880714
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daba16a55ffa7e1547d39e480bc3ed52259c7bc93bc3947e6db1312e0e5cdc35
3
+ size 571097267
models/word_markov/cs_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "cs",
5
- "unique_contexts": 5180358,
6
- "total_transitions": 336349903
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "cs",
5
+ "unique_contexts": 3817910,
6
+ "total_transitions": 239019518
7
  }