ClinFly / clinfly_app_cli.py
GERNET Enody
Add files via upload
ae03dcd unverified
import csv
import os
import argparse
import pandas as pd
from utilities.anonymize import (
get_cities_list,
get_abbreviation_dict_correction,
reformat_to_report,
anonymize_analyzer,
anonymize_engine,
add_space_to_comma_endpoint,
get_list_not_deidentify,
config_deidentify,
)
from utilities.translate import get_translation_dict_correction, translate_report
from utilities.convert import (
convert_df_no_header,
convert_df,
convert_json,
convert_list_phenogenius,
convert_pdf_to_text,
)
from utilities.extract_hpo import add_biometrics, extract_hpo
from utilities.get_model import get_models, get_nlp_marian
import gc
def main():
    """Process the report currently held in the module-level globals.

    The function takes no arguments: it reads ``Report``, ``Report_id``,
    ``Last_name``, ``First_name``, ``args`` and the NLP / analyzer objects
    (``nlp_fr``, ``marian_fr_en``, ``dict_correction``,
    ``dict_abbreviation_correction``, ``analyzer``, ``engine``,
    ``proper_noun``) that the ``__main__`` block assigns before calling it.

    Steps, in order:

    1. ``translate_report`` + ``reformat_to_report`` on the raw report.
    2. ``anonymize_analyzer`` / ``anonymize_engine`` on the reformatted text;
       the engine output is written to ``<result_dir>/Reports``.
    3. ``add_space_to_comma_endpoint``, ``add_biometrics`` and
       ``extract_hpo`` on the engine output. The two tables returned by
       ``extract_hpo`` are tagged ("high"/kept and "low"/not kept), merged,
       and written to ``<result_dir>/TSV`` (all rows) and to
       ``<result_dir>/JSON`` and ``<result_dir>/TXT`` (kept rows only).

    All output files are opened with ``encoding="utf-8"`` so the result does
    not depend on the platform's locale encoding.
    """
    print("Code Starting")

    # Shared "<id>_<last>_<first>" prefix of every output file name.
    file_stem = Report_id + "_" + Last_name + "_" + First_name

    def output_path(subdir, suffix):
        # The trailing "" makes os.path.join end with a path separator, so
        # the resulting string is identical to the original concatenation.
        return os.path.join(args.result_dir, subdir, "") + file_stem + suffix

    translated_text, _, _ = translate_report(
        Report,
        Last_name,
        First_name,
        nlp_fr,
        marian_fr_en,
        dict_correction,
        dict_abbreviation_correction,
    )
    report_text = reformat_to_report(translated_text, nlp_fr)
    del translated_text

    print("Translation and De-identification")
    analyzed_text, analyzer_results, _, _ = anonymize_analyzer(
        report_text, analyzer, proper_noun, Last_name, First_name
    )
    print(analyzed_text)
    deidentified_text = anonymize_engine(
        report_text, analyzer_results, engine, nlp_fr
    )

    # One DataFrame row per line of the de-identified report.
    deidentified_df = pd.DataFrame(deidentified_text.split("\n"))
    report_suffix = "_translated_and_deindentified_report.txt"
    with open(
        output_path("Reports", report_suffix), "w", encoding="utf-8"
    ) as file:
        file.write(convert_df_no_header(deidentified_df).decode("utf-8"))
    # This message prints the bare file name (no directory), as before.
    print("Text file created successfully : " + file_stem + report_suffix)

    print("Summarization")
    spaced_text = add_space_to_comma_endpoint(deidentified_text, nlp_fr)
    biometrics_text, _ = add_biometrics(spaced_text, nlp_fr)
    clinphen, clinphen_unsafe = extract_hpo(biometrics_text)
    del deidentified_text
    del spaced_text
    del biometrics_text

    # Second table from extract_hpo: tagged low confidence, excluded from
    # the kept list.
    clinphen_unsafe["To keep in list"] = False
    clinphen_unsafe["Confidence on extraction"] = "low"
    # First table: tagged high confidence, kept.
    clinphen["Confidence on extraction"] = "high"
    clinphen["To keep in list"] = True

    cols = [
        "HPO ID",
        "Phenotype name",
        "To keep in list",
        "No. occurrences",
        "Earliness (lower = earlier)",
        "Confidence on extraction",
        "Example sentence",
    ]
    # reset_index() adds an "index" column; selecting `cols` drops it again.
    clinphen_df = pd.concat([clinphen, clinphen_unsafe]).reset_index()[cols]
    # Explicit comparison kept on purpose: it is an element-wise pandas
    # comparison, not a Python truthiness test.
    clinphen_df_without_low_confidence = clinphen_df[
        clinphen_df["To keep in list"] == True  # noqa: E712
    ]
    del clinphen
    del clinphen_unsafe
    gc.collect()

    tsv_path = output_path("TSV", "_summarized_report.tsv")
    with open(tsv_path, "w", encoding="utf-8") as file:
        file.write(convert_df(clinphen_df).decode("utf-8"))
    print("Tsv file created successfully : " + tsv_path)

    json_path = output_path("JSON", "_summarized_report.json")
    with open(json_path, "w", encoding="utf-8") as file:
        file.write(convert_json(clinphen_df_without_low_confidence))
    print("JSON file created successfully : " + json_path)

    txt_path = output_path("TXT", "_summarized_report.txt")
    with open(txt_path, "w", encoding="utf-8") as file:
        file.write(convert_list_phenogenius(clinphen_df_without_low_confidence))
    print("Text file created successfully : " + txt_path)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, create the output
    # directories, load the models and helper tables, then call main() once
    # per usable line of the tab-separated input file.
    print("Welcome to the Clinfly app")
    parser = argparse.ArgumentParser(description="Description of clinfly arguments")
    parser.add_argument(
        "--file",
        type=str,
        help="the input file which contains the visits informations",
        required=True,
    )
    parser.add_argument(
        "--language",
        choices=["fr", "es", "de"],
        type=str,
        help="The language of the input : fr, es , de",
        required=True,
    )
    parser.add_argument(
        "--model_dir",
        default=os.path.expanduser("~"),
        type=str,
        help="The directory where the models will be downloaded.",
    )
    parser.add_argument(
        "--result_dir",
        default="Results",
        type=str,
        help="The directory where the results will be placed.",
    )
    args = parser.parse_args()

    # exist_ok=True replaces the separate exists() checks and avoids the
    # check-then-create race.
    os.makedirs(args.model_dir, exist_ok=True)
    os.makedirs(args.result_dir, exist_ok=True)
    for result_subdir in ("Reports", "TSV", "JSON", "TXT"):
        os.makedirs(os.path.join(args.result_dir, result_subdir), exist_ok=True)

    print("Language chosen :", args.language)
    models_status = get_models(args.language, args.model_dir)
    dict_correction = get_translation_dict_correction()
    dict_abbreviation_correction = get_abbreviation_dict_correction()
    proper_noun = get_list_not_deidentify()
    cities_list = get_cities_list()
    analyzer, engine = config_deidentify(cities_list)
    nlp_fr, marian_fr_en = get_nlp_marian(args.language)
    file_name = args.file

    # Module-level names that main() reads; they are (re)assigned for every
    # input line below.
    Report_id: str
    Last_name: str
    First_name: str
    Report: str

    if os.path.isfile(args.file):
        with open(file_name, "r") as fichier:
            for line_number, ligne in enumerate(fichier, start=1):
                stripped_line = ligne.strip()
                if not stripped_line:
                    # Blank lines (e.g. a trailing newline) carry no report.
                    continue
                elements = stripped_line.split("\t")
                if len(elements) != 4:
                    # Report and skip instead of aborting the whole batch on
                    # a tuple-unpacking ValueError.
                    print(
                        f"Skipping line {line_number}: expected 4 "
                        f"tab-separated fields, found {len(elements)}."
                    )
                    continue
                Report_id, Last_name, First_name, text_or_link = elements
                print("Report_id:", Report_id)
                print("Last_name:", Last_name)
                print("First_name:", First_name)
                if os.path.exists(text_or_link):
                    if not text_or_link.lower().endswith(".pdf"):
                        print("Unsupported file type. Please provide a link to a PDF files.")
                        # Skip this row: otherwise main() would run with
                        # Report undefined or still holding the previous
                        # row's text.
                        continue
                    print(f"Processing PDF file: {text_or_link}")
                    Report = convert_pdf_to_text(text_or_link)
                else:
                    # Not an existing path: the field itself is the report.
                    Report = text_or_link
                print("Report:", Report)
                main()
                print()
    else:
        print("Input is not a file. Please provide a valid input.")