Blessmore committed
Commit 1bfa3f4
Parent: 4f059ef

Upload 10 files

.gitattributes CHANGED
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 Fast_text_50_dim/shona_fasttext_vectors_50d.kv filter=lfs diff=lfs merge=lfs -text
+ Fast_text_100_dim/shona_corpus_E.txt filter=lfs diff=lfs merge=lfs -text
+ Fast_text_100_dim/shona_fasttext_vectors_100d.kv filter=lfs diff=lfs merge=lfs -text
Fast_text_100_dim/.ipynb_checkpoints/FAST_TEXT -100-checkpoint.ipynb ADDED
@@ -0,0 +1,324 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from gensim.models import FastText\n",
+ "from gensim.utils import simple_preprocess\n",
+ "import re\n",
+ "import time\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocess_text(text):\n",
+ "    text = text.lower()  # Lowercase\n",
+ "    text = re.sub(r'[^\\w\\s]', '', text)  # Remove punctuation\n",
+ "    return simple_preprocess(text)\n",
+ "\n",
+ "def read_corpus(file_path):\n",
+ "    with open(file_path, 'r', encoding='utf-8') as file:\n",
+ "        for line in file:\n",
+ "            yield preprocess_text(line)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "corpus_file_path = 'shona_corpus_E.txt'\n",
+ "# Read and preprocess the corpus\n",
+ "sentences = list(read_corpus(corpus_file_path))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[['mavambo',\n",
+ "  'kusikwa',\n",
+ "  'kwezvinhu',\n",
+ "  'zvose',\n",
+ "  'pakutanga',\n",
+ "  'mwari',\n",
+ "  'akasika',\n",
+ "  'denga',\n",
+ "  'nepasi'],\n",
+ " ['zvino',\n",
+ "  'rakanga',\n",
+ "  'risina',\n",
+ "  'chiumbo',\n",
+ "  'risina',\n",
+ "  'uye',\n",
+ "  'rakanga',\n",
+ "  'riri',\n",
+ "  'pamusoro',\n",
+ "  'pehwenje'],\n",
+ " ['mweya', 'wamwari', 'wakanga', 'uchidzengerera', 'pamusoro', 'pemvura']]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sentences[:3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start_time = time.time()\n",
+ "\n",
+ "# Train FastText model\n",
+ "model = FastText(\n",
+ "    sentences,\n",
+ "    vector_size=100,  # Higher dimension for better performance\n",
+ "    window=7,\n",
+ "    min_count=5,\n",
+ "    workers=4,\n",
+ "    sg=1,             # Skip-gram model\n",
+ "    epochs=100,       # More epochs for thorough training\n",
+ "    bucket=2000000,   # Large bucket size for handling subwords\n",
+ "    min_n=3,          # Minimum length of char n-grams\n",
+ "    max_n=6           # Maximum length of char n-grams\n",
+ ")\n",
+ "end_time = time.time()\n",
+ "# Calculate the elapsed time (time.time() returns seconds, so convert)\n",
+ "elapsed_time = end_time - start_time\n",
+ "print(\"Time taken:\", elapsed_time / 60, \"minutes\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the full model and the lightweight keyed vectors (100d, matching vector_size)\n",
+ "model.save(\"shona_fasttext_100d.model\")\n",
+ "model.wv.save(\"shona_fasttext_vectors_100d.kv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def evaluate_similarity(model, word_pairs):\n",
+ "    similarity_scores = []\n",
+ "    for word1, word2, score in word_pairs:\n",
+ "        similarity_score = model.wv.similarity(word1, word2)\n",
+ "        similarity_scores.append((word1, word2, score, similarity_score))\n",
+ "    print(\"Similarity task evaluation:\")\n",
+ "    for word1, word2, human_score, model_score in similarity_scores:\n",
+ "        print(f\"{word1}-{word2}: Human score = {human_score}, Model score = {model_score}\")\n",
+ "\n",
+ "# Example similarity word pairs\n",
+ "similarity_word_pairs = [(\"murume\", \"mukadzi\", 0.8), (\"mwana\", \"mukomana\", 0.6)]\n",
+ "evaluate_similarity(model, similarity_word_pairs)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def perform_analogical_reasoning(model, a, b, c, topn=5):\n",
+ "    d = model.wv[b] - model.wv[a] + model.wv[c]\n",
+ "    closest_words = model.wv.similar_by_vector(d, topn=topn + 3)  # Extra results so topn remain after filtering\n",
+ "    result_words = [word for word, _ in closest_words if word not in [a, b, c]]\n",
+ "    return result_words[:topn]\n",
+ "\n",
+ "# Example usage\n",
+ "a = \"murume\"   # man\n",
+ "b = \"mambo\"    # king\n",
+ "c = \"mukadzi\"  # woman\n",
+ "\n",
+ "predicted_words = perform_analogical_reasoning(model, a, b, c)\n",
+ "if predicted_words:\n",
+ "    print(f\"{a} is to {b} as {c} is to: {', '.join(predicted_words)}\")\n",
+ "else:\n",
+ "    print(\"No suitable words found.\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Perform analogical reasoning: find d such that a : b :: c : d\n",
+ "def perform_analogical_reasoning(model, a, b, c, topn=5):\n",
+ "    # Calculate the vector d as b - a + c\n",
+ "    d = model.wv[b] - model.wv[a] + model.wv[c]\n",
+ "\n",
+ "    # Find the words that best complete the analogy\n",
+ "    closest_words = model.wv.similar_by_vector(d, topn=topn + 3)  # Extra results so topn remain after filtering\n",
+ "    result_words = [word for word, _ in closest_words if word not in [a, b, c]]\n",
+ "\n",
+ "    # Ensure we return exactly 'topn' words\n",
+ "    return result_words[:topn]\n",
+ "\n",
+ "# Example usage\n",
+ "a = \"murume\"   # man\n",
+ "b = \"sekuru\"   # grandfather\n",
+ "c = \"mukadzi\"  # woman\n",
+ "\n",
+ "predicted_words = perform_analogical_reasoning(model, a, b, c)\n",
+ "if predicted_words:\n",
+ "    print(f\"{a} is to {b} as {c} is to: {', '.join(predicted_words)}\")\n",
+ "else:\n",
+ "    print(\"No suitable words found.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Test similarity\n",
+ "similar_words = model.wv.most_similar(\"seka\", topn=10)\n",
+ "print(similar_words)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
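Note: the notebook saves two kinds of artifact, the full `.model` (retrainable) and the `.wv` keyed vectors. A minimal reload sketch, not part of the commit, assuming gensim 4.x and that the `.npy` sidecar files from this commit sit next to the `.model` file:

```python
from gensim.models import FastText

# Load the full model; gensim also reads the .npy sidecar arrays
# (syn1neg, vectors_ngrams, vectors_vocab) saved alongside it.
model = FastText.load("Fast_text_100_dim/shona_fasttext_100d.model")

# Nearest neighbours for an in-vocabulary word
print(model.wv.most_similar("mwari", topn=5))
```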
Fast_text_100_dim/FAST_TEXT -100.ipynb ADDED
The diff for this file is too large to render.
 
Fast_text_100_dim/shona_corpus_E.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b8a3674c729ea64dc6cdf21ad9567b12cfc396f53f19111abb94f022cb4c619
+ size 98750355
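This file is a Git LFS pointer, so a plain checkout without LFS yields only the three lines above rather than the ~99 MB corpus. One way to fetch the actual payload is via `huggingface_hub`; a sketch in which the `repo_id` is a hypothetical placeholder for this repository:

```python
from huggingface_hub import hf_hub_download

# Downloads the resolved LFS object and returns its local cache path.
# "user/shona-fasttext" is a placeholder; substitute the real repo id.
corpus_path = hf_hub_download(
    repo_id="user/shona-fasttext",
    filename="Fast_text_100_dim/shona_corpus_E.txt",
)
print(corpus_path)
```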
Fast_text_100_dim/shona_fasttext_100d.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c82833fb1735675fdf13bd818eff25dfe07e7d74c1dd6b8b8e135727c28f847b
+ size 3506554
Fast_text_100_dim/shona_fasttext_100d.model.syn1neg.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98c793c31c7a0a93624404d2cf1c99e981dddc1a95f0e79aaa7072c36a27ea44
+ size 42891328
Fast_text_100_dim/shona_fasttext_100d.model.wv.vectors_ngrams.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72bf4d036fabf91fb5d82842d036bd1ba86ed08a11d37e392ef37f84c5c58cea
+ size 800000128
Fast_text_100_dim/shona_fasttext_100d.model.wv.vectors_vocab.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a95a0dbfd1cb3e5bffe627cf096b42692f8dcc415661a349aafe1ad5fb028290
+ size 42891328
Fast_text_100_dim/shona_fasttext_vectors_100d.kv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9383b68ad469c6309bb6cf7c643392d47e2f589cca13079935d5d4a300ce7f34
+ size 3501801
Fast_text_100_dim/shona_fasttext_vectors_100d.kv.vectors_ngrams.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72bf4d036fabf91fb5d82842d036bd1ba86ed08a11d37e392ef37f84c5c58cea
+ size 800000128
Fast_text_100_dim/shona_fasttext_vectors_100d.kv.vectors_vocab.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a95a0dbfd1cb3e5bffe627cf096b42692f8dcc415661a349aafe1ad5fb028290
+ size 42891328
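The `.kv` files above hold only the keyed vectors (with their `.npy` sidecars), the lighter option when no further training is needed. Since these are FastText vectors, words outside the trained vocabulary still receive embeddings composed from character n-grams. A minimal sketch, assuming gensim 4.x, with `kusekazve` as a made-up out-of-vocabulary form:

```python
from gensim.models import KeyedVectors

# Loads FastTextKeyedVectors; the two .npy sidecar files from this
# commit must sit next to the .kv file.
kv = KeyedVectors.load("Fast_text_100_dim/shona_fasttext_vectors_100d.kv")

word = "kusekazve"                # hypothetical OOV word
print(word in kv.key_to_index)    # False if absent from the trained vocab
print(kv[word][:5])               # vector composed from char n-grams
```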