| import csv | |
| import os | |
| import argparse | |
| import pandas as pd | |
| from utilities.anonymize import ( | |
| get_cities_list, | |
| get_abbreviation_dict_correction, | |
| reformat_to_report, | |
| anonymize_analyzer, | |
| anonymize_engine, | |
| add_space_to_comma_endpoint, | |
| get_list_not_deidentify, | |
| config_deidentify, | |
| ) | |
| from utilities.translate import get_translation_dict_correction, translate_report | |
| from utilities.convert import ( | |
| convert_df_no_header, | |
| convert_df, | |
| convert_json, | |
| convert_list_phenogenius, | |
| convert_pdf_to_text, | |
| ) | |
| from utilities.extract_hpo import add_biometrics, extract_hpo | |
| from utilities.get_model import get_models, get_nlp_marian | |
| import gc | |
def main():
    """Translate, de-identify and summarize one medical report.

    Reads its inputs from module-level globals set by the ``__main__`` block
    (``Report``, ``Report_id``, ``Last_name``, ``First_name``, the loaded
    NLP/translation models, the correction dictionaries and ``args``) and
    writes four files under ``args.result_dir``:

    * ``Reports/…_translated_and_deindentified_report.txt`` — anonymized text
    * ``TSV/…_summarized_report.tsv``  — all extracted HPO terms
    * ``JSON/…_summarized_report.json`` — high-confidence terms only
    * ``TXT/…_summarized_report.txt``  — high-confidence terms only

    Returns: None. Raises whatever the underlying utilities raise on I/O or
    model errors.
    """
    print("Code Starting")

    def _out_path(subdir, suffix):
        # Every output file shares the "<id>_<last>_<first><suffix>" naming scheme.
        return os.path.join(
            args.result_dir, subdir, f"{Report_id}_{Last_name}_{First_name}{suffix}"
        )

    # --- Translation -------------------------------------------------------
    MarianText, _, _ = translate_report(
        Report,
        Last_name,
        First_name,
        nlp_fr,
        marian_fr_en,
        dict_correction,
        dict_abbreviation_correction,
    )
    MarianText_report = reformat_to_report(MarianText, nlp_fr)
    del MarianText  # free the raw translation before the heavy anonymize step

    # --- De-identification -------------------------------------------------
    print("Translation and De-identification")
    (
        MarianText_anonymize_report_analyze,
        analyzer_results_return,
        _,
        _,
    ) = anonymize_analyzer(
        MarianText_report, analyzer, proper_noun, Last_name, First_name
    )
    print(MarianText_anonymize_report_analyze)
    MarianText_anonymize_report_engine = anonymize_engine(
        MarianText_report, analyzer_results_return, engine, nlp_fr
    )

    # One DataFrame row per line of the anonymized report.
    report_df = pd.DataFrame(MarianText_anonymize_report_engine.split("\n"))

    report_path = _out_path("Reports", "_translated_and_deindentified_report.txt")
    with open(report_path, "w") as file:
        file.write(convert_df_no_header(report_df).decode("utf-8"))
    print("Text file created successfully : " + report_path)

    # --- Summarization (HPO extraction) ------------------------------------
    print("Summarization")
    MarianText_anonymized_reformat_space = add_space_to_comma_endpoint(
        MarianText_anonymize_report_engine, nlp_fr
    )
    MarianText_anonymized_reformat_biometrics, _ = add_biometrics(
        MarianText_anonymized_reformat_space, nlp_fr
    )
    clinphen, clinphen_unsafe = extract_hpo(MarianText_anonymized_reformat_biometrics)
    del MarianText_anonymize_report_engine
    del MarianText_anonymized_reformat_space
    del MarianText_anonymized_reformat_biometrics

    # Low-confidence extractions are kept in the full TSV but flagged so they
    # can be filtered out of the JSON/TXT summaries below.
    clinphen_unsafe["To keep in list"] = False
    clinphen_unsafe["Confidence on extraction"] = "low"
    clinphen["Confidence on extraction"] = "high"
    clinphen["To keep in list"] = True

    cols = [
        "HPO ID",
        "Phenotype name",
        "To keep in list",
        "No. occurrences",
        "Earliness (lower = earlier)",
        "Confidence on extraction",
        "Example sentence",
    ]
    clinphen_df = pd.concat([clinphen, clinphen_unsafe]).reset_index()[cols]
    clinphen_df_without_low_confidence = clinphen_df[clinphen_df["To keep in list"]]
    del clinphen
    del clinphen_unsafe
    gc.collect()  # the spaCy/Marian intermediates can be large

    # --- Write the three summary files -------------------------------------
    tsv_path = _out_path("TSV", "_summarized_report.tsv")
    with open(tsv_path, "w") as file:
        file.write(convert_df(clinphen_df).decode("utf-8"))
    print("Tsv file created successfully : " + tsv_path)

    json_path = _out_path("JSON", "_summarized_report.json")
    with open(json_path, "w") as file:
        file.write(convert_json(clinphen_df_without_low_confidence))
    print("JSON file created successfully : " + json_path)

    txt_path = _out_path("TXT", "_summarized_report.txt")
    with open(txt_path, "w") as file:
        file.write(convert_list_phenogenius(clinphen_df_without_low_confidence))
    print("Text file created successfully : " + txt_path)
if __name__ == "__main__":
    print("Welcome to the Clinfly app")
    parser = argparse.ArgumentParser(description="Description of clinfly arguments")
    parser.add_argument(
        "--file",
        type=str,
        help="the input file which contains the visits informations",
        required=True,
    )
    parser.add_argument(
        "--language",
        choices=["fr", "es", "de"],
        type=str,
        help="The language of the input : fr, es , de",
        required=True,
    )
    parser.add_argument(
        "--model_dir",
        default=os.path.expanduser("~"),
        type=str,
        help="The directory where the models will be downloaded.",
    )
    parser.add_argument(
        "--result_dir",
        default="Results",
        type=str,
        help="The directory where the results will be placed.",
    )
    args = parser.parse_args()

    # Create the whole output tree up front. exist_ok avoids the race-prone
    # "check then create" pattern; makedirs also creates result_dir itself.
    os.makedirs(args.model_dir, exist_ok=True)
    for subdir in ("Reports", "TSV", "JSON", "TXT"):
        os.makedirs(os.path.join(args.result_dir, subdir), exist_ok=True)

    print("Language chosen :", args.language)
    models_status = get_models(args.language, args.model_dir)
    dict_correction = get_translation_dict_correction()
    dict_abbreviation_correction = get_abbreviation_dict_correction()
    proper_noun = get_list_not_deidentify()
    cities_list = get_cities_list()
    analyzer, engine = config_deidentify(cities_list)
    nlp_fr, marian_fr_en = get_nlp_marian(args.language)

    file_name = args.file
    # Globals consumed by main() for the current visit being processed.
    Report_id: str
    Last_name: str
    First_name: str
    Report: str
    if os.path.isfile(args.file):
        with open(file_name, "r") as fichier:
            # Input format: one visit per line,
            # "<report_id>\t<last_name>\t<first_name>\t<text or path to PDF>".
            for ligne in fichier:
                elements = ligne.strip().split("\t")
                if len(elements) != 4:
                    # Malformed line: report it and keep processing the rest
                    # instead of crashing the whole batch on unpack.
                    print(
                        "Skipping malformed line "
                        f"(expected 4 tab-separated fields, got {len(elements)}): {ligne!r}"
                    )
                    continue
                Report_id, Last_name, First_name, text_or_link = elements
                print("Report_id:", Report_id)
                print("Last_name:", Last_name)
                print("First_name:", First_name)
                if os.path.exists(text_or_link):
                    if text_or_link.lower().endswith(".pdf"):
                        print(f"Processing PDF file: {text_or_link}")
                        Report = convert_pdf_to_text(text_or_link)
                    else:
                        # BUG FIX: the original fell through and still called
                        # main() with a stale (or, on the first line, undefined)
                        # Report. Skip this entry instead.
                        print(f"Unsupported file type. Please provide a link to a PDF files.")
                        continue
                else:
                    # Not a path on disk: treat the field as the report text itself.
                    Report = text_or_link
                print("Report:", Report)
                main()
                print()
    else:
        print("Input is not a file. Please provide a valid input.")