Spaces:
Runtime error
Runtime error
import os | |
import glob | |
import argparse | |
import xml.etree.ElementTree as ET | |
import re | |
import json | |
from pathlib import Path | |
from tqdm import tqdm | |
from ftfy import fix_text | |
def convert_collection(args):
    """Convert every XML file under ``args.input_dir`` into one JSON-lines file.

    Each ``*.xml`` file found recursively below ``args.input_dir`` is parsed
    with ``parse_xml`` and written as one JSON object per line to
    ``trec21.json`` inside ``args.output_dir``.

    Args:
        args: Object exposing ``input_dir`` and ``output_dir`` attributes
            (the parsed command-line arguments when run as a script).
    """
    print('converting collection....')
    # Materialise the file list so the progress bar knows the total count.
    xml_list = list(Path(args.input_dir).rglob('*.xml'))
    output_path = os.path.join(args.output_dir, 'trec21.json')
    # The context manager guarantees the file is flushed and closed even if
    # parsing one of the documents raises part-way through the collection.
    with open(output_path, 'w', encoding='utf-8', newline='\n') as output_json_file:
        for xml_path in tqdm(xml_list):
            (
                doc_id,
                title,
                condition,
                summary,
                detailed_description,
                eligibility,
            ) = parse_xml(xml_path)
            result_dict = {
                'id': doc_id,
                # 'contents' is the concatenation of all text fields,
                # separated by single spaces.
                'contents': f'{title} {condition} {summary} {detailed_description} {eligibility}',
                'title': title,
                'condition': condition,
                'summary': summary,
                'detailed_description': detailed_description,
                'eligibility': eligibility,
            }
            output_json_file.write(json.dumps(result_dict) + '\n')
def parse_xml(file_dir):
    """Extract the text fields of one trial XML file.

    Args:
        file_dir: Path of the XML file to parse (handed to ``ET.parse``).

    Returns:
        A list ``[doc_id, title, condition, summary, detailed_description,
        eligibility]`` of strings. ``title`` comes from ``official_title``
        when that element exists, otherwise from ``brief_title``. Any field
        whose element is absent (other than the id) is the empty string.
        In every field, runs of two or more whitespace characters are
        collapsed to a single space and the result is passed through
        ``fix_text``.

    Raises:
        AttributeError: If the document contains no ``nct_id`` element.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    tree = ET.parse(file_dir)

    def first_text(*paths):
        """Return the full text of the first path that matches, else ''."""
        for path in paths:
            element = tree.find(path)
            # Compare against None explicitly: an Element without child
            # elements is falsy, so a plain truthiness test would discard
            # text-only leaves such as <official_title> or <condition>.
            if element is not None:
                return ''.join(element.itertext())
        return ''

    raw_fields = [
        # The id is mandatory; a missing <nct_id> is left to raise rather
        # than silently producing a document with an empty id.
        ''.join(tree.find('.//nct_id').itertext()),
        first_text('.//official_title', './/brief_title'),
        first_text('.//condition'),
        first_text('.//brief_summary'),
        first_text('.//detailed_description'),
        first_text('.//eligibility/criteria'),
    ]

    # Collapse whitespace runs first, then repair mis-encoded text.
    return [fix_text(re.sub(r'\s\s+', ' ', value)) for value in raw_fields]
if __name__ == '__main__':
    # Script entry point: read the input/output directories from the command
    # line, make sure the output directory exists, then run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", required=True, help='input directory to trec xml data files')
    parser.add_argument('--output_dir', required=True, help='output folder for json files')
    args = parser.parse_args()
    # exist_ok=True avoids the check-then-create race of testing for the
    # directory first and creating it afterwards.
    os.makedirs(args.output_dir, exist_ok=True)
    convert_collection(args)
    print('Done!')