anuragshas
/

en-hi-transliteration

Model card Files and versions Community

anuragshas committed on Apr 2, 2022

Commit

a8cc40a

•

1 Parent(s): 69be551

Initial Commit

Browse files

Files changed (6) hide show

README.md +53 -0
hi_scripts.json +25 -0
hi_v2_model.pth +3 -0
pred_test.json +0 -0
training_script.ipynb +0 -0
xmltodict.ipynb +82 -0

README.md CHANGED Viewed

@@ -1,3 +1,56 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
 ---
+## Dataset
+NEWS2018 DATASET_04, Task ID: M-EnHi http://workshop.colips.org/news2018/dataset.html
+## Notebooks
+- `xmltodict.ipynb` contains the code to convert the `xml` files to `json` for training
+- `training_script.ipynb` contains the code for training and inference. It is a modified version of https://github.com/AI4Bharat/IndianNLP-Transliteration/blob/master/NoteBooks/Xlit_TrainingSetup_condensed.ipynb
+## Predictions
+`pred_test.json` contains top-10 predictions on the validation set of the dataset
+## Evaluation Scores on validation set
+TOP 10 SCORES FOR 1000 SAMPLES:
+
+|Metrics   |    Score   |
+| ----------- | ----------- |
+|ACC:      |    0.703000|
+|Mean F-score:| 0.949289|
+|MRR:         | 0.486549|
+|MAP_ref:     | 0.381000|
+
+TOP 5 SCORES FOR 1000 SAMPLES:
+
+|Metrics   |    Score   |
+| ----------- | ----------- |
+|ACC:          |0.621000|
+|Mean F-score: |0.937985|
+|MRR:          |0.475033|
+|MAP_ref:      |0.381000|
+
+TOP 3 SCORES FOR 1000 SAMPLES:
+
+|Metrics   |    Score   |
+| ----------- | ----------- |
+|ACC:          |0.560000|
+|Mean F-score: |0.927025|
+|MRR:          |0.461333|
+|MAP_ref:      |0.381000|
+
+TOP 2 SCORES FOR 1000 SAMPLES:
+
+|Metrics   |    Score   |
+| ----------- | ----------- |
+|ACC:      |    0.502000|
+|Mean F-score: | 0.913697|
+|MRR:         | 0.442000|
+|MAP_ref:     | 0.381000|
+
+TOP 1 SCORES FOR 1000 SAMPLES:
+
+|Metrics   |    Score   |
+| ----------- | ----------- |
+|ACC:         | 0.382000|
+|Mean F-score: | 0.881272|
+|MRR:          | 0.382000|
+|MAP_ref:      | 0.380500|

hi_scripts.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+    "WARNING" : " !!! Do not modify the Order of Glyph List !!!",
+    "UNICODE" : {"name": "devanagari", "begin":2304, "end":2431},
+    "LANGUAGE": "hindi",
+    "glyphs" : [
+        "ऄ", "अ", "आ", "इ", "ई", "उ", "ऊ","ऍ", "ऎ", "ए", "ऐ",
+        "ऑ", "ऒ", "ओ", "औ","ऋ","ॠ","ऌ","ॡ","ॲ", "ॐ",
+        "क", "ख", "ग", "घ", "ङ", "च", "छ", "ज", "झ", "ञ", "ट", "ठ", "ड", "ढ", "ण",
+        "त", "थ", "द", "ध", "न", "ऩ", "प", "फ", "ब", "भ", "म", "य", "र", "ऱ", "ल",
+        "ळ", "ऴ", "व", "श", "ष", "स", "ह", "क़", "ख़", "ग़", "ज़", "ड़", "ढ़", "फ़", "य़",
+        "्", "ा", "ि", "ी", "ु", "ू", "ॅ", "ॆ", "े", "ै", "ॉ", "ॊ", "ो", "ौ",
+        "ृ", "ॄ", "ॢ", "ॣ", "ँ", "ं", "ः", "़", "॑",  "ऽ", "॥",
+        "\u200c", "\u200d"
+    ],
+    "numsym_map" : {
+    "0" : ["०"], "1" : ["१"], "2" : ["२"], "3" : ["३"], "4" : ["४"],
+    "5" : ["५"], "6" : ["६"], "7" : ["७"], "8" : ["८"], "9" : ["९"],
+    "." : ["।", "॰"]
+    }
+    }

hi_v2_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89d3dd4e5fa7ea355c194fce3ecce1fd5e953e08784db26cacbe5993d1cd4eae
+size 40927419

pred_test.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_script.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

xmltodict.ipynb ADDED Viewed

	@@ -0,0 +1,82 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import xmltodict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_xml_to_json(input_filename, output_filename):\n",
+    "    with open(input_filename) as xml_file:\n",
+    "        data_dict = xmltodict.parse(xml_file.read())\n",
+    "    val_dict = {}\n",
+    "    for x in data_dict['TransliterationCorpus']['Name']:\n",
+    "        val_dict[x['SourceName']] = []\n",
+    "        if isinstance(x['TargetName'],list):\n",
+    "            for y in x['TargetName']:\n",
+    "                val_dict[x['SourceName']].append(y['#text'])\n",
+    "        else:\n",
+    "            val_dict[x['SourceName']].append(x['TargetName']['#text'])\n",
+    "    json_data = json.dumps(val_dict, ensure_ascii=False)\n",
+    "        \n",
+    "    # Write the json data to output\n",
+    "    # json file\n",
+    "    with open(output_filename, \"w\") as json_file:\n",
+    "        json_file.write(json_data)\n",
+    "        json_file.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "convert_xml_to_json(\"NEWS2018_M-EnHi_dev.xml\", \"test.json\")\n",
+    "convert_xml_to_json(\"NEWS2018_M-EnHi_trn.xml\", \"train.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "72eda931ce19f909a11c0956f8f945c55c4564a332ca55ff029bf31469cdd29f"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.5 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}