def _target_texts(target_name):
    """Return the transliteration strings held in one ``TargetName`` value.

    Args:
        target_name: The value stored under ``'TargetName'`` for a single
            ``Name`` entry. It may be a list of entries or one entry; each
            entry is either a mapping with a ``'#text'`` key or a plain
            string.

    Returns:
        list: The text of every target, in document order.
    """
    entries = target_name if isinstance(target_name, list) else [target_name]
    # xmltodict represents an element that carries attributes as a mapping
    # with the text under '#text', and an attribute-less element as a bare
    # string; accept both shapes.
    return [entry if isinstance(entry, str) else entry['#text'] for entry in entries]


def convert_xml_to_json(input_filename, output_filename):
    """Convert a transliteration-corpus XML file into a JSON mapping.

    The XML is parsed with ``xmltodict``; every ``Name`` element under the
    ``TransliterationCorpus`` root contributes one key (its ``SourceName``)
    whose value is the list of its ``TargetName`` texts. The resulting
    dictionary is serialised with ``ensure_ascii=False`` and written to
    ``output_filename``. If the same ``SourceName`` occurs more than once,
    the later entry replaces the earlier one.

    Args:
        input_filename: Path of the XML file to read (read as UTF-8).
        output_filename: Path of the JSON file to create or overwrite
            (written as UTF-8).

    Raises:
        KeyError: If an expected key (``TransliterationCorpus``, ``Name``,
            ``SourceName``, ``TargetName`` or ``#text``) is missing.
        OSError: If either file cannot be opened.
    """
    # Explicit UTF-8 so that decoding does not depend on the platform's
    # default locale encoding.
    with open(input_filename, encoding="utf-8") as xml_file:
        data_dict = xmltodict.parse(xml_file.read())

    names = data_dict['TransliterationCorpus']['Name']
    # A corpus holding a single <Name> element is parsed to one mapping
    # instead of a list; normalise so the loop below always sees entries.
    if not isinstance(names, list):
        names = [names]

    val_dict = {}
    for name_entry in names:
        val_dict[name_entry['SourceName']] = _target_texts(name_entry['TargetName'])

    json_data = json.dumps(val_dict, ensure_ascii=False)

    # ensure_ascii=False leaves non-ASCII characters unescaped, so the output
    # file must be opened with an encoding that can represent them. The
    # ``with`` block closes the file; no explicit close() is needed.
    with open(output_filename, "w", encoding="utf-8") as json_file:
        json_file.write(json_data)