import os
import glob
import argparse
import xml.etree.ElementTree as ET
import re
import json
from pathlib import Path

from tqdm import tqdm
from ftfy import fix_text

# Runs of two or more whitespace characters; a single whitespace character
# (including a lone newline) is deliberately left untouched.
_MULTI_WHITESPACE = re.compile(r'\s\s+')


def convert_collection(args):
    """Convert every XML file under ``args.input_dir`` into one JSON-lines file.

    Each XML file found recursively under ``args.input_dir`` is parsed with
    :func:`parse_xml` and written as one JSON object per line to
    ``<args.output_dir>/trec21.json``.

    Args:
        args: Object exposing ``input_dir`` and ``output_dir`` attributes
            (for example the namespace returned by ``argparse``).
    """
    print('converting collection....')
    xml_list = list(Path(args.input_dir).rglob('*.xml'))
    output_path = os.path.join(args.output_dir, 'trec21.json')
    # The context manager guarantees the file is flushed and closed even when
    # parsing one of the documents raises.
    with open(output_path, 'w', encoding='utf-8', newline='\n') as output_json_file:
        for xml_path in tqdm(xml_list):
            (doc_id, title, condition, summary,
             detailed_description, eligibility) = parse_xml(xml_path)
            result_dict = {
                'id': doc_id,
                'contents': f'{title} {condition} {summary} {detailed_description} {eligibility}',
                'title': title,
                'condition': condition,
                'summary': summary,
                'detailed_description': detailed_description,
                'eligibility': eligibility
            }
            output_json_file.write(json.dumps(result_dict) + '\n')


def _first_text(tree, *paths):
    """Return the concatenated text of the first path that matches, else ''."""
    for path in paths:
        element = tree.find(path)
        # Compare against None explicitly: an Element without child elements
        # is falsy, so a plain truth test would discard text-only elements.
        if element is not None:
            return ''.join(element.itertext())
    return ''


def _clean(text):
    """Collapse multi-character whitespace runs to one space, then run ftfy."""
    return fix_text(_MULTI_WHITESPACE.sub(' ', text))


def parse_xml(file_dir):
    """Extract the indexed text fields from a single XML document.

    Only the first matching element is used for each field. The title is
    taken from ``official_title`` and falls back to ``brief_title`` when no
    ``official_title`` element exists. Fields whose element is missing are
    returned as empty strings.

    Args:
        file_dir: Path (or file object) of the XML document to parse.

    Returns:
        A list ``[doc_id, title, condition, summary, detailed_description,
        eligibility]`` of cleaned strings.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
    """
    tree = ET.parse(file_dir)
    raw_fields = [
        _first_text(tree, './/nct_id'),
        _first_text(tree, './/official_title', './/brief_title'),
        _first_text(tree, './/condition'),
        _first_text(tree, './/brief_summary'),
        _first_text(tree, './/detailed_description'),
        _first_text(tree, './/eligibility/criteria'),
    ]
    return [_clean(value) for value in raw_fields]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", required=True, help='input directory to trec xml data files')
    parser.add_argument('--output_dir', required=True, help='output folder for json files')
    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_dir, exist_ok=True)

    convert_collection(args)
    print('Done!')