anuragshas
committed on
Commit
•
a8cc40a
1
Parent(s):
69be551
Initial Commit
Browse files- README.md +53 -0
- hi_scripts.json +25 -0
- hi_v2_model.pth +3 -0
- pred_test.json +0 -0
- training_script.ipynb +0 -0
- xmltodict.ipynb +82 -0
README.md
CHANGED
@@ -1,3 +1,56 @@
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
---
|
4 |
+
|
5 |
+
## Dataset
|
6 |
+
NEWS2018 DATASET_04, Task ID: M-EnHi http://workshop.colips.org/news2018/dataset.html
|
7 |
+
|
8 |
+
## Notebooks
|
9 |
+
- `xmltodict.ipynb` contains the code to convert the `xml` files to `json` for training
|
10 |
+
- `training_script.ipynb` contains the code for training and inference. It is a modified version of https://github.com/AI4Bharat/IndianNLP-Transliteration/blob/master/NoteBooks/Xlit_TrainingSetup_condensed.ipynb
|
11 |
+
|
12 |
+
|
13 |
+
## Predictions
|
14 |
+
`pred_test.json` contains top-10 predictions on the validation set of the dataset
|
15 |
+
|
16 |
+
## Evaluation Scores on validation set
|
17 |
+
TOP 10 SCORES FOR 1000 SAMPLES:
|
18 |
+
|Metrics | Score |
|
19 |
+
| ----------- | ----------- |
|
20 |
+
|ACC: | 0.703000|
|
21 |
+
|Mean F-score:| 0.949289|
|
22 |
+
|MRR: | 0.486549|
|
23 |
+
|MAP_ref: | 0.381000|
|
24 |
+
|
25 |
+
|
26 |
+
TOP 5 SCORES FOR 1000 SAMPLES:
|
27 |
+
|Metrics | Score |
|
28 |
+
| ----------- | ----------- |
|
29 |
+
|ACC: |0.621000|
|
30 |
+
|Mean F-score: |0.937985|
|
31 |
+
|MRR: |0.475033|
|
32 |
+
|MAP_ref: |0.381000|
|
33 |
+
|
34 |
+
TOP 3 SCORES FOR 1000 SAMPLES:
|
35 |
+
|Metrics | Score |
|
36 |
+
| ----------- | ----------- |
|
37 |
+
|ACC: |0.560000|
|
38 |
+
|Mean F-score: |0.927025|
|
39 |
+
|MRR: |0.461333|
|
40 |
+
|MAP_ref: |0.381000|
|
41 |
+
|
42 |
+
TOP 2 SCORES FOR 1000 SAMPLES:
|
43 |
+
|Metrics | Score |
|
44 |
+
| ----------- | ----------- |
|
45 |
+
|ACC: | 0.502000|
|
46 |
+
|Mean F-score: | 0.913697|
|
47 |
+
|MRR: | 0.442000|
|
48 |
+
|MAP_ref: | 0.381000|
|
49 |
+
|
50 |
+
TOP 1 SCORES FOR 1000 SAMPLES:
|
51 |
+
|Metrics | Score |
|
52 |
+
| ----------- | ----------- |
|
53 |
+
|ACC: | 0.382000|
|
54 |
+
|Mean F-score: | 0.881272|
|
55 |
+
|MRR: | 0.382000|
|
56 |
+
|MAP_ref: | 0.380500|
|
hi_scripts.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"WARNING" : " !!! Do not modify the Order of Glyph List !!!",
|
3 |
+
"UNICODE" : {"name": "devanagari", "begin":2304, "end":2431},
|
4 |
+
"LANGUAGE": "hindi",
|
5 |
+
|
6 |
+
"glyphs" : [
|
7 |
+
|
8 |
+
"ऄ", "अ", "आ", "इ", "ई", "उ", "ऊ","ऍ", "ऎ", "ए", "ऐ",
|
9 |
+
"ऑ", "ऒ", "ओ", "औ","ऋ","ॠ","ऌ","ॡ","ॲ", "ॐ",
|
10 |
+
"क", "ख", "ग", "घ", "ङ", "च", "छ", "ज", "झ", "ञ", "ट", "ठ", "ड", "ढ", "ण",
|
11 |
+
"त", "थ", "द", "ध", "न", "ऩ", "प", "फ", "ब", "भ", "म", "य", "र", "ऱ", "ल",
|
12 |
+
"ळ", "ऴ", "व", "श", "ष", "स", "ह", "क़", "ख़", "ग़", "ज़", "ड़", "ढ़", "फ़", "य़",
|
13 |
+
"्", "ा", "ि", "ी", "ु", "ू", "ॅ", "ॆ", "े", "ै", "ॉ", "ॊ", "ो", "ौ",
|
14 |
+
"ृ", "ॄ", "ॢ", "ॣ", "ँ", "ं", "ः", "़", "॑", "ऽ", "॥",
|
15 |
+
"\u200c", "\u200d"
|
16 |
+
|
17 |
+
],
|
18 |
+
|
19 |
+
"numsym_map" : {
|
20 |
+
"0" : ["०"], "1" : ["१"], "2" : ["२"], "3" : ["३"], "4" : ["४"],
|
21 |
+
"5" : ["५"], "6" : ["६"], "7" : ["७"], "8" : ["८"], "9" : ["९"],
|
22 |
+
"." : ["।", "॰"]
|
23 |
+
}
|
24 |
+
|
25 |
+
}
|
hi_v2_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89d3dd4e5fa7ea355c194fce3ecce1fd5e953e08784db26cacbe5993d1cd4eae
|
3 |
+
size 40927419
|
pred_test.json
ADDED
The diff for this file is too large to render.
See raw diff
|
training_script.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
xmltodict.ipynb
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import json\n",
|
10 |
+
"import xmltodict"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 2,
|
16 |
+
"metadata": {},
|
17 |
+
"outputs": [],
|
18 |
+
"source": [
|
19 |
+
"def convert_xml_to_json(input_filename, output_filename):\n",
|
20 |
+
" with open(input_filename) as xml_file:\n",
|
21 |
+
" data_dict = xmltodict.parse(xml_file.read())\n",
|
22 |
+
" val_dict = {}\n",
|
23 |
+
" for x in data_dict['TransliterationCorpus']['Name']:\n",
|
24 |
+
" val_dict[x['SourceName']] = []\n",
|
25 |
+
" if isinstance(x['TargetName'],list):\n",
|
26 |
+
" for y in x['TargetName']:\n",
|
27 |
+
" val_dict[x['SourceName']].append(y['#text'])\n",
|
28 |
+
" else:\n",
|
29 |
+
" val_dict[x['SourceName']].append(x['TargetName']['#text'])\n",
|
30 |
+
" json_data = json.dumps(val_dict, ensure_ascii=False)\n",
|
31 |
+
" \n",
|
32 |
+
" # Write the json data to output\n",
|
33 |
+
" # json file\n",
|
34 |
+
" with open(output_filename, \"w\") as json_file:\n",
|
35 |
+
" json_file.write(json_data)\n",
|
36 |
+
" json_file.close()"
|
37 |
+
]
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"cell_type": "code",
|
41 |
+
"execution_count": 3,
|
42 |
+
"metadata": {},
|
43 |
+
"outputs": [],
|
44 |
+
"source": [
|
45 |
+
"convert_xml_to_json(\"NEWS2018_M-EnHi_dev.xml\", \"test.json\")\n",
|
46 |
+
"convert_xml_to_json(\"NEWS2018_M-EnHi_trn.xml\", \"train.json\")"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": null,
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": []
|
55 |
+
}
|
56 |
+
],
|
57 |
+
"metadata": {
|
58 |
+
"interpreter": {
|
59 |
+
"hash": "72eda931ce19f909a11c0956f8f945c55c4564a332ca55ff029bf31469cdd29f"
|
60 |
+
},
|
61 |
+
"kernelspec": {
|
62 |
+
"display_name": "Python 3.9.5 ('base')",
|
63 |
+
"language": "python",
|
64 |
+
"name": "python3"
|
65 |
+
},
|
66 |
+
"language_info": {
|
67 |
+
"codemirror_mode": {
|
68 |
+
"name": "ipython",
|
69 |
+
"version": 3
|
70 |
+
},
|
71 |
+
"file_extension": ".py",
|
72 |
+
"mimetype": "text/x-python",
|
73 |
+
"name": "python",
|
74 |
+
"nbconvert_exporter": "python",
|
75 |
+
"pygments_lexer": "ipython3",
|
76 |
+
"version": "3.9.5"
|
77 |
+
},
|
78 |
+
"orig_nbformat": 4
|
79 |
+
},
|
80 |
+
"nbformat": 4,
|
81 |
+
"nbformat_minor": 2
|
82 |
+
}
|