anuragshas committed on
Commit
a8cc40a
1 Parent(s): 69be551

Initial Commit

Browse files
Files changed (6) hide show
  1. README.md +53 -0
  2. hi_scripts.json +25 -0
  3. hi_v2_model.pth +3 -0
  4. pred_test.json +0 -0
  5. training_script.ipynb +0 -0
  6. xmltodict.ipynb +82 -0
README.md CHANGED
@@ -1,3 +1,56 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ ## Dataset
6
+ NEWS2018 DATASET_04, Task ID: M-EnHi http://workshop.colips.org/news2018/dataset.html
7
+
8
+ ## Notebooks
9
+ - `xmltodict.ipynb` contains the code to convert the `xml` files to `json` for training
10
+ - `training_script.ipynb` contains the code for training and inference. It is a modified version of https://github.com/AI4Bharat/IndianNLP-Transliteration/blob/master/NoteBooks/Xlit_TrainingSetup_condensed.ipynb
11
+
12
+
13
+ ## Predictions
14
+ `pred_test.json` contains top-10 predictions on the validation set of the dataset
15
+
16
+ ## Evaluation Scores on validation set
17
+ TOP 10 SCORES FOR 1000 SAMPLES:
18
+ |Metrics | Score |
19
+ | ----------- | ----------- |
20
+ |ACC: | 0.703000|
21
+ |Mean F-score:| 0.949289|
22
+ |MRR: | 0.486549|
23
+ |MAP_ref: | 0.381000|
24
+
25
+
26
+ TOP 5 SCORES FOR 1000 SAMPLES:
27
+ |Metrics | Score |
28
+ | ----------- | ----------- |
29
+ |ACC: |0.621000|
30
+ |Mean F-score: |0.937985|
31
+ |MRR: |0.475033|
32
+ |MAP_ref: |0.381000|
33
+
34
+ TOP 3 SCORES FOR 1000 SAMPLES:
35
+ |Metrics | Score |
36
+ | ----------- | ----------- |
37
+ |ACC: |0.560000|
38
+ |Mean F-score: |0.927025|
39
+ |MRR: |0.461333|
40
+ |MAP_ref: |0.381000|
41
+
42
+ TOP 2 SCORES FOR 1000 SAMPLES:
43
+ |Metrics | Score |
44
+ | ----------- | ----------- |
45
+ |ACC: | 0.502000|
46
+ |Mean F-score: | 0.913697|
47
+ |MRR: | 0.442000|
48
+ |MAP_ref: | 0.381000|
49
+
50
+ TOP 1 SCORES FOR 1000 SAMPLES:
51
+ |Metrics | Score |
52
+ | ----------- | ----------- |
53
+ |ACC: | 0.382000|
54
+ |Mean F-score: | 0.881272|
55
+ |MRR: | 0.382000|
56
+ |MAP_ref: | 0.380500|
hi_scripts.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "WARNING" : " !!! Do not modify the Order of Glyph List !!!",
3
+ "UNICODE" : {"name": "devanagari", "begin":2304, "end":2431},
4
+ "LANGUAGE": "hindi",
5
+
6
+ "glyphs" : [
7
+
8
+ "ऄ", "अ", "आ", "इ", "ई", "उ", "ऊ","ऍ", "ऎ", "ए", "ऐ",
9
+ "ऑ", "ऒ", "ओ", "औ","ऋ","ॠ","ऌ","ॡ","ॲ", "ॐ",
10
+ "क", "ख", "ग", "घ", "ङ", "च", "छ", "ज", "झ", "ञ", "ट", "ठ", "ड", "ढ", "ण",
11
+ "त", "थ", "द", "ध", "न", "ऩ", "प", "फ", "ब", "भ", "म", "य", "र", "ऱ", "ल",
12
+ "ळ", "ऴ", "व", "श", "ष", "स", "ह", "क़", "ख़", "ग़", "ज़", "ड़", "ढ़", "फ़", "य़",
13
+ "्", "ा", "ि", "ी", "ु", "ू", "ॅ", "ॆ", "े", "ै", "ॉ", "ॊ", "ो", "ौ",
14
+ "ृ", "ॄ", "ॢ", "ॣ", "ँ", "ं", "ः", "़", "॑", "ऽ", "॥",
15
+ "\u200c", "\u200d"
16
+
17
+ ],
18
+
19
+ "numsym_map" : {
20
+ "0" : ["०"], "1" : ["१"], "2" : ["२"], "3" : ["३"], "4" : ["४"],
21
+ "5" : ["५"], "6" : ["६"], "7" : ["७"], "8" : ["८"], "9" : ["९"],
22
+ "." : ["।", "॰"]
23
+ }
24
+
25
+ }
hi_v2_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89d3dd4e5fa7ea355c194fce3ecce1fd5e953e08784db26cacbe5993d1cd4eae
3
+ size 40927419
pred_test.json ADDED
The diff for this file is too large to render. See raw diff
training_script.ipynb ADDED
The diff for this file is too large to render. See raw diff
xmltodict.ipynb ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import json\n",
10
+ "import xmltodict"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "def convert_xml_to_json(input_filename, output_filename):\n",
20
+ " with open(input_filename) as xml_file:\n",
21
+ " data_dict = xmltodict.parse(xml_file.read())\n",
22
+ " val_dict = {}\n",
23
+ " for x in data_dict['TransliterationCorpus']['Name']:\n",
24
+ " val_dict[x['SourceName']] = []\n",
25
+ " if isinstance(x['TargetName'],list):\n",
26
+ " for y in x['TargetName']:\n",
27
+ " val_dict[x['SourceName']].append(y['#text'])\n",
28
+ " else:\n",
29
+ " val_dict[x['SourceName']].append(x['TargetName']['#text'])\n",
30
+ " json_data = json.dumps(val_dict, ensure_ascii=False)\n",
31
+ " \n",
32
+ " # Write the json data to output\n",
33
+ " # json file\n",
34
+ " with open(output_filename, \"w\") as json_file:\n",
35
+ " json_file.write(json_data)\n",
36
+ " json_file.close()"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 3,
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "convert_xml_to_json(\"NEWS2018_M-EnHi_dev.xml\", \"test.json\")\n",
46
+ "convert_xml_to_json(\"NEWS2018_M-EnHi_trn.xml\", \"train.json\")"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": []
55
+ }
56
+ ],
57
+ "metadata": {
58
+ "interpreter": {
59
+ "hash": "72eda931ce19f909a11c0956f8f945c55c4564a332ca55ff029bf31469cdd29f"
60
+ },
61
+ "kernelspec": {
62
+ "display_name": "Python 3.9.5 ('base')",
63
+ "language": "python",
64
+ "name": "python3"
65
+ },
66
+ "language_info": {
67
+ "codemirror_mode": {
68
+ "name": "ipython",
69
+ "version": 3
70
+ },
71
+ "file_extension": ".py",
72
+ "mimetype": "text/x-python",
73
+ "name": "python",
74
+ "nbconvert_exporter": "python",
75
+ "pygments_lexer": "ipython3",
76
+ "version": "3.9.5"
77
+ },
78
+ "orig_nbformat": 4
79
+ },
80
+ "nbformat": 4,
81
+ "nbformat_minor": 2
82
+ }