Blessmore committed
Commit 1bfa3f4
Parent: 4f059ef

Upload 10 files

.gitattributes CHANGED
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 Fast_text_50_dim/shona_fasttext_vectors_50d.kv filter=lfs diff=lfs merge=lfs -text
+ Fast_text_100_dim/shona_corpus_E.txt filter=lfs diff=lfs merge=lfs -text
+ Fast_text_100_dim/shona_fasttext_vectors_100d.kv filter=lfs diff=lfs merge=lfs -text
Fast_text_100_dim/.ipynb_checkpoints/FAST_TEXT -100-checkpoint.ipynb ADDED
@@ -0,0 +1,324 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from gensim.models import FastText\n",
+ "from gensim.utils import simple_preprocess\n",
+ "import re\n",
+ "import time\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocess_text(text):\n",
+ "    text = text.lower()  # Lowercase\n",
+ "    text = re.sub(r'[^\\w\\s]', '', text)  # Remove punctuation\n",
+ "    return simple_preprocess(text)\n",
+ "\n",
+ "def read_corpus(file_path):\n",
+ "    with open(file_path, 'r', encoding='utf-8') as file:\n",
+ "        for line in file:\n",
+ "            yield preprocess_text(line)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "corpus_file_path = 'shona_corpus_E.txt'\n",
+ "# Read and preprocess the corpus\n",
+ "sentences = list(read_corpus(corpus_file_path))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[['mavambo',\n",
+ "  'kusikwa',\n",
+ "  'kwezvinhu',\n",
+ "  'zvose',\n",
+ "  'pakutanga',\n",
+ "  'mwari',\n",
+ "  'akasika',\n",
+ "  'denga',\n",
+ "  'nepasi'],\n",
+ " ['zvino',\n",
+ "  'rakanga',\n",
+ "  'risina',\n",
+ "  'chiumbo',\n",
+ "  'risina',\n",
+ "  'uye',\n",
+ "  'rakanga',\n",
+ "  'riri',\n",
+ "  'pamusoro',\n",
+ "  'pehwenje'],\n",
+ " ['mweya', 'wamwari', 'wakanga', 'uchidzengerera', 'pamusoro', 'pemvura']]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sentences[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start_time = time.time()\n",
+ "\n",
+ "# Train FastText model\n",
+ "model = FastText(\n",
+ "    sentences,\n",
+ "    vector_size=100,  # Higher dimension for better performance\n",
+ "    window=7,\n",
+ "    min_count=5,\n",
+ "    workers=4,\n",
+ "    sg=1,             # Skip-gram model\n",
+ "    epochs=100,       # More epochs for thorough training\n",
+ "    bucket=2000000,   # Large bucket size for handling subwords\n",
+ "    min_n=3,          # Minimum length of char n-grams\n",
+ "    max_n=6           # Maximum length of char n-grams\n",
+ ")\n",
+ "end_time = time.time()\n",
+ "# Calculate the elapsed time (time.time() returns seconds, so convert)\n",
+ "elapsed_time = end_time - start_time\n",
+ "print(\"Time taken:\", elapsed_time / 60, \"minutes\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the full model and the lightweight keyed vectors (100d, matching vector_size)\n",
+ "model.save(\"shona_fasttext_100d.model\")\n",
+ "model.wv.save(\"shona_fasttext_vectors_100d.kv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def evaluate_similarity(model, word_pairs):\n",
+ "    similarity_scores = []\n",
+ "    for word1, word2, score in word_pairs:\n",
+ "        similarity_score = model.wv.similarity(word1, word2)\n",
+ "        similarity_scores.append((word1, word2, score, similarity_score))\n",
+ "    print(\"Similarity task evaluation:\")\n",
+ "    for word1, word2, human_score, model_score in similarity_scores:\n",
+ "        print(f\"{word1}-{word2}: Human score = {human_score}, Model score = {model_score}\")\n",
+ "\n",
+ "# Example similarity word pairs\n",
+ "similarity_word_pairs = [(\"murume\", \"mukadzi\", 0.8), (\"mwana\", \"mukomana\", 0.6)]\n",
+ "evaluate_similarity(model, similarity_word_pairs)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def perform_analogical_reasoning(model, a, b, c, topn=5):\n",
+ "    d = model.wv[b] - model.wv[a] + model.wv[c]\n",
+ "    closest_words = model.wv.similar_by_vector(d, topn=topn + 3)  # Extra results so topn remain after filtering\n",
+ "    result_words = [word for word, _ in closest_words if word not in [a, b, c]]\n",
+ "    return result_words[:topn]\n",
+ "\n",
+ "# Example usage\n",
+ "a = \"murume\"   # man\n",
+ "b = \"mambo\"    # king\n",
+ "c = \"mukadzi\"  # woman\n",
+ "\n",
+ "predicted_words = perform_analogical_reasoning(model, a, b, c)\n",
+ "if predicted_words:\n",
+ "    print(f\"{a} is to {b} as {c} is to: {', '.join(predicted_words)}\")\n",
+ "else:\n",
+ "    print(\"No suitable words found.\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Perform analogical reasoning: find d such that a : b :: c : d\n",
+ "def perform_analogical_reasoning(model, a, b, c, topn=5):\n",
+ "    # Calculate the vector d as b - a + c\n",
+ "    d = model.wv[b] - model.wv[a] + model.wv[c]\n",
+ "\n",
+ "    # Find the words that best complete the analogy\n",
+ "    closest_words = model.wv.similar_by_vector(d, topn=topn + 3)  # Extra results so topn remain after filtering\n",
+ "    result_words = [word for word, _ in closest_words if word not in [a, b, c]]\n",
+ "\n",
+ "    # Ensure we return exactly 'topn' words\n",
+ "    return result_words[:topn]\n",
+ "\n",
+ "# Example usage\n",
+ "a = \"murume\"   # man\n",
+ "b = \"sekuru\"   # grandfather\n",
+ "c = \"mukadzi\"  # woman\n",
+ "\n",
+ "predicted_words = perform_analogical_reasoning(model, a, b, c)\n",
+ "if predicted_words:\n",
+ "    print(f\"{a} is to {b} as {c} is to: {', '.join(predicted_words)}\")\n",
+ "else:\n",
+ "    print(\"No suitable words found.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Test similarity\n",
+ "similar_words = model.wv.most_similar(\"seka\", topn=10)\n",
+ "print(similar_words)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
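Note: the notebook saves two kinds of artifact, the full `.model` (retrainable) and the `.wv` keyed vectors. A minimal reload sketch, not part of the commit, assuming gensim 4.x and that the `.npy` sidecar files from this commit sit next to the `.model` file:

```python
from gensim.models import FastText

# Load the full model; gensim also reads the .npy sidecar arrays
# (syn1neg, vectors_ngrams, vectors_vocab) saved alongside it.
model = FastText.load("Fast_text_100_dim/shona_fasttext_100d.model")

# Nearest neighbours for an in-vocabulary word
print(model.wv.most_similar("mwari", topn=5))
```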
Fast_text_100_dim/FAST_TEXT -100.ipynb ADDED
The diff for this file is too large to render.
 
Fast_text_100_dim/shona_corpus_E.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b8a3674c729ea64dc6cdf21ad9567b12cfc396f53f19111abb94f022cb4c619
+ size 98750355
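This file is a Git LFS pointer, so a plain checkout without LFS yields only the three lines above rather than the ~99 MB corpus. One way to fetch the actual payload is via `huggingface_hub`; a sketch in which the `repo_id` is a hypothetical placeholder for this repository:

```python
from huggingface_hub import hf_hub_download

# Downloads the resolved LFS object and returns its local cache path.
# "user/shona-fasttext" is a placeholder; substitute the real repo id.
corpus_path = hf_hub_download(
    repo_id="user/shona-fasttext",
    filename="Fast_text_100_dim/shona_corpus_E.txt",
)
print(corpus_path)
```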
Fast_text_100_dim/shona_fasttext_100d.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c82833fb1735675fdf13bd818eff25dfe07e7d74c1dd6b8b8e135727c28f847b
+ size 3506554
Fast_text_100_dim/shona_fasttext_100d.model.syn1neg.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98c793c31c7a0a93624404d2cf1c99e981dddc1a95f0e79aaa7072c36a27ea44
+ size 42891328
Fast_text_100_dim/shona_fasttext_100d.model.wv.vectors_ngrams.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72bf4d036fabf91fb5d82842d036bd1ba86ed08a11d37e392ef37f84c5c58cea
+ size 800000128
Fast_text_100_dim/shona_fasttext_100d.model.wv.vectors_vocab.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a95a0dbfd1cb3e5bffe627cf096b42692f8dcc415661a349aafe1ad5fb028290
+ size 42891328
Fast_text_100_dim/shona_fasttext_vectors_100d.kv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9383b68ad469c6309bb6cf7c643392d47e2f589cca13079935d5d4a300ce7f34
+ size 3501801
Fast_text_100_dim/shona_fasttext_vectors_100d.kv.vectors_ngrams.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72bf4d036fabf91fb5d82842d036bd1ba86ed08a11d37e392ef37f84c5c58cea
+ size 800000128
Fast_text_100_dim/shona_fasttext_vectors_100d.kv.vectors_vocab.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a95a0dbfd1cb3e5bffe627cf096b42692f8dcc415661a349aafe1ad5fb028290
+ size 42891328
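The `.kv` files above hold only the keyed vectors (with their `.npy` sidecars), the lighter option when no further training is needed. Since these are FastText vectors, words outside the trained vocabulary still receive embeddings composed from character n-grams. A minimal sketch, assuming gensim 4.x, with `kusekazve` as a made-up out-of-vocabulary form:

```python
from gensim.models import KeyedVectors

# Loads FastTextKeyedVectors; the two .npy sidecar files from this
# commit must sit next to the .kv file.
kv = KeyedVectors.load("Fast_text_100_dim/shona_fasttext_vectors_100d.kv")

word = "kusekazve"                # hypothetical OOV word
print(word in kv.key_to_index)    # False if absent from the trained vocab
print(kv[word][:5])               # vector composed from char n-grams
```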