|
import csv
|
|
import os
|
|
import argparse
|
|
import pandas as pd
|
|
from utilities.anonymize import (
|
|
get_cities_list,
|
|
get_abbreviation_dict_correction,
|
|
reformat_to_report,
|
|
anonymize_analyzer,
|
|
anonymize_engine,
|
|
add_space_to_comma_endpoint,
|
|
get_list_not_deidentify,
|
|
config_deidentify,
|
|
)
|
|
from utilities.translate import get_translation_dict_correction, translate_report
|
|
from utilities.convert import (
|
|
convert_df_no_header,
|
|
convert_df,
|
|
convert_json,
|
|
convert_list_phenogenius,
|
|
convert_pdf_to_text,
|
|
)
|
|
from utilities.extract_hpo import add_biometrics, extract_hpo
|
|
from utilities.get_model import get_models, get_nlp_marian
|
|
import gc
|
|
|
|
|
|
def _write_result(subdir, file_name, content):
    """Write *content* to ``<args.result_dir>/<subdir>/<file_name>``.

    The file is always written as UTF-8: the callers pass text that was
    decoded from UTF-8 bytes, so relying on the platform default encoding
    could fail or mis-encode non-ASCII characters.

    Returns the path that was written, built with the same concatenation the
    success messages use, so those messages stay unchanged.
    """
    path = os.path.join(args.result_dir, subdir, "") + file_name
    with open(path, "w", encoding="utf-8") as file:
        file.write(content)
    return path


def main():
    """Translate, de-identify and summarize the report currently loaded.

    Takes no parameters: it reads the module-level names ``Report``,
    ``Report_id``, ``Last_name``, ``First_name``, ``nlp_fr``,
    ``marian_fr_en``, ``dict_correction``, ``dict_abbreviation_correction``,
    ``analyzer``, ``engine``, ``proper_noun`` and ``args``, which must be
    bound before the call.

    Four files are written below ``args.result_dir``, each named
    ``<Report_id>_<Last_name>_<First_name>_<suffix>``:

    * ``Reports/..._translated_and_deindentified_report.txt``
    * ``TSV/..._summarized_report.tsv`` (every extracted phenotype)
    * ``JSON/..._summarized_report.json`` (rows flagged "To keep in list")
    * ``TXT/..._summarized_report.txt`` (rows flagged "To keep in list")

    NOTE(review): the output file names embed the patient's last and first
    name even though the content is de-identified -- confirm this is intended.
    """
    print("Code Starting")
    translated_text, _, _ = translate_report(
        Report,
        Last_name,
        First_name,
        nlp_fr,
        marian_fr_en,
        dict_correction,
        dict_abbreviation_correction,
    )
    report_text = reformat_to_report(translated_text, nlp_fr)
    del translated_text

    print("Translation and De-identification")
    analyzed_report, analyzer_results, _, _ = anonymize_analyzer(
        report_text, analyzer, proper_noun, Last_name, First_name
    )
    print(analyzed_report)

    deidentified_report = anonymize_engine(
        report_text, analyzer_results, engine, nlp_fr
    )

    file_stem = f"{Report_id}_{Last_name}_{First_name}"

    # One DataFrame row per line of the de-identified report.
    report_file_name = file_stem + "_translated_and_deindentified_report.txt"
    _write_result(
        "Reports",
        report_file_name,
        convert_df_no_header(
            pd.DataFrame(deidentified_report.split("\n"))
        ).decode("utf-8"),
    )
    # This message has always shown the bare file name, without the directory.
    print("Text file created successfully : " + report_file_name)

    print("Summarization")
    spaced_text = add_space_to_comma_endpoint(deidentified_report, nlp_fr)
    text_with_biometrics, _ = add_biometrics(spaced_text, nlp_fr)
    high_confidence, low_confidence = extract_hpo(text_with_biometrics)

    # Drop the intermediate texts before building the result tables.
    del deidentified_report
    del spaced_text
    del text_with_biometrics

    # The second result of extract_hpo is kept in the TSV but excluded from
    # the JSON/TXT outputs via the "To keep in list" flag.
    low_confidence["To keep in list"] = False
    low_confidence["Confidence on extraction"] = "low"

    high_confidence["Confidence on extraction"] = "high"
    high_confidence["To keep in list"] = True

    cols = [
        "HPO ID",
        "Phenotype name",
        "To keep in list",
        "No. occurrences",
        "Earliness (lower = earlier)",
        "Confidence on extraction",
        "Example sentence",
    ]
    # reset_index() adds an "index" column; selecting `cols` drops it again.
    all_phenotypes = pd.concat([high_confidence, low_confidence]).reset_index()
    all_phenotypes = all_phenotypes[cols]

    # Element-wise comparison on a pandas column, not a plain boolean test.
    kept_phenotypes = all_phenotypes[
        all_phenotypes["To keep in list"] == True  # noqa: E712
    ]
    del high_confidence
    del low_confidence
    gc.collect()

    tsv_path = _write_result(
        "TSV",
        file_stem + "_summarized_report.tsv",
        convert_df(all_phenotypes).decode("utf-8"),
    )
    print("Tsv file created successfully : " + tsv_path)

    json_path = _write_result(
        "JSON",
        file_stem + "_summarized_report.json",
        convert_json(kept_phenotypes),
    )
    print("JSON file created successfully : " + json_path)

    txt_path = _write_result(
        "TXT",
        file_stem + "_summarized_report.txt",
        convert_list_phenogenius(kept_phenotypes),
    )
    print("Text file created successfully : " + txt_path)
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, prepare the output
    # directories and the shared resources, then run main() once per row of
    # the tab-separated input file.
    print("Welcome to the Clinfly app")

    parser = argparse.ArgumentParser(description="Description of clinfly arguments")
    parser.add_argument(
        "--file",
        type=str,
        help="the input file which contains the visits informations",
        required=True,
    )
    parser.add_argument(
        "--language",
        choices=["fr", "es", "de"],
        type=str,
        help="The language of the input : fr, es , de",
        required=True,
    )
    parser.add_argument(
        "--model_dir",
        default=os.path.expanduser("~"),
        type=str,
        help="The directory where the models will be downloaded.",
    )
    parser.add_argument(
        "--result_dir",
        default="Results",
        type=str,
        help="The directory where the results will be placed.",
    )

    args = parser.parse_args()

    # exist_ok=True replaces the racy "check, then create" sequence.
    required_dirs = [args.model_dir, args.result_dir]
    required_dirs += [
        os.path.join(args.result_dir, subdir)
        for subdir in ("Reports", "TSV", "JSON", "TXT")
    ]
    for directory in required_dirs:
        os.makedirs(directory, exist_ok=True)

    print("Language chosen :", args.language)
    # The return value is not used below; the call is kept for its effect.
    models_status = get_models(args.language, args.model_dir)
    dict_correction = get_translation_dict_correction()
    dict_abbreviation_correction = get_abbreviation_dict_correction()
    proper_noun = get_list_not_deidentify()
    cities_list = get_cities_list()
    analyzer, engine = config_deidentify(cities_list)
    nlp_fr, marian_fr_en = get_nlp_marian(args.language)

    file_name = args.file
    # Per-row globals read by main(); these bare annotations bind nothing.
    Report_id: str
    Last_name: str
    First_name: str
    Report: str

    if os.path.isfile(args.file):
        # NOTE(review): the input is read with the platform default encoding,
        # as before -- confirm the expected encoding of the visits file.
        with open(file_name, "r") as fichier:
            for line_number, ligne in enumerate(fichier, start=1):
                elements = ligne.strip().split("\t")
                if elements == [""]:
                    # Blank line (e.g. a trailing newline): nothing to process.
                    continue
                if len(elements) != 4:
                    # Report the bad row and keep going instead of aborting
                    # the whole batch on the unpacking error.
                    print(
                        f"Skipping line {line_number}: expected 4 tab-separated "
                        f"fields, got {len(elements)}."
                    )
                    continue
                Report_id, Last_name, First_name, text_or_link = elements
                print("Report_id:", Report_id)
                print("Last_name:", Last_name)
                print("First_name:", First_name)
                if os.path.exists(text_or_link):
                    if text_or_link.lower().endswith(".pdf"):
                        print(f"Processing PDF file: {text_or_link}")
                        Report = convert_pdf_to_text(text_or_link)
                    else:
                        print("Unsupported file type. Please provide a link to a PDF files.")
                        # Skip this row: otherwise Report is either unbound or
                        # still holds the previous row's text, which would be
                        # processed under this row's identifiers.
                        continue
                else:
                    # Not an existing path: the fourth field is the report text.
                    Report = text_or_link
                print("Report:", Report)
                main()
                print()
    else:
        print("Input is not a file. Please provide a valid input.")
|
|
|