NetsPresso_QA / scripts /trec-ct /convert_trec21_ct_to_json.py
geonmin-kim's picture
Upload folder using huggingface_hub
d6585f5
raw
history blame
No virus
2.83 kB
import os
import glob
import argparse
import xml.etree.ElementTree as ET
import re
import json
from pathlib import Path
from tqdm import tqdm
from ftfy import fix_text
def convert_collection(args):
    """Convert every ``*.xml`` file under ``args.input_dir`` to JSON lines.

    The input directory is searched recursively. Each XML file is parsed with
    ``parse_xml`` and written as one JSON object per line to
    ``<args.output_dir>/trec21.json``.

    Args:
        args: Object exposing ``input_dir`` and ``output_dir`` attributes
            (the parsed command-line namespace).
    """
    print('converting collection....')
    xml_list = list(Path(args.input_dir).rglob('*.xml'))
    output_path = os.path.join(args.output_dir, 'trec21.json')

    # The context manager guarantees the output file is flushed and closed
    # even when parsing one of the XML files raises part-way through.
    with open(output_path, 'w', encoding='utf-8', newline='\n') as output_json_file:
        for xml_path in tqdm(xml_list):
            (
                doc_id,
                title,
                condition,
                summary,
                detailed_description,
                eligibility,
            ) = parse_xml(xml_path)
            result_dict = {
                'id': doc_id,
                # 'contents' is the space-joined concatenation of all text fields.
                'contents': f'{title} {condition} {summary} {detailed_description} {eligibility}',
                'title': title,
                'condition': condition,
                'summary': summary,
                'detailed_description': detailed_description,
                'eligibility': eligibility,
            }
            output_json_file.write(json.dumps(result_dict) + '\n')
# Matches runs of two or more whitespace characters (a single whitespace
# character on its own is left untouched).
_MULTI_WHITESPACE = re.compile(r'\s\s+')


def _element_text(root, path):
    """Return the concatenated text of the first element matching *path*, or ''."""
    element = root.find(path)
    # Compare against None explicitly: an Element without child elements is
    # falsy, so a plain truth test would throw away leaf nodes that hold text.
    if element is None:
        return ''
    return ''.join(element.itertext())


def _clean(text):
    """Collapse whitespace runs to a single space, then apply ``fix_text``."""
    return fix_text(_MULTI_WHITESPACE.sub(' ', text))


def parse_xml(file_dir):
    """Extract the text fields of one XML document.

    Args:
        file_dir: Path (or file object) of the XML file to parse.

    Returns:
        A list ``[doc_id, title, condition, summary, detailed_description,
        eligibility]`` of cleaned strings. The title comes from
        ``official_title`` and falls back to ``brief_title`` when the official
        title is missing or empty. Every field other than ``doc_id`` is ``''``
        when its element is absent.

    Raises:
        AttributeError: If the document has no ``nct_id`` element.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    xml = ET.parse(file_dir)

    # The id is mandatory; a missing element still fails loudly here rather
    # than producing a record with an empty id.
    doc_id = ''.join(xml.find('.//nct_id').itertext())

    title = _element_text(xml, './/official_title') or _element_text(xml, './/brief_title')
    condition = _element_text(xml, './/condition')
    summary = _element_text(xml, './/brief_summary')
    detailed_description = _element_text(xml, './/detailed_description')
    eligibility = _element_text(xml, './/eligibility/criteria')

    fields = [doc_id, title, condition, summary, detailed_description, eligibility]
    return [_clean(field) for field in fields]
if __name__ == '__main__':
    # Command-line entry point: read the input/output directories, make sure
    # the output directory exists, then run the conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", required=True, help='input directory to trec xml data files')
    parser.add_argument('--output_dir', required=True, help='output folder for json files')
    args = parser.parse_args()

    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() before calling os.makedirs().
    os.makedirs(args.output_dir, exist_ok=True)

    convert_collection(args)
    print('Done!')