Spaces:

kyauy
/

ClinFly

Sleeping

ClinFly / clinfly_app_cli.py

GERNET Enody

Add files via upload

ae03dcd unverified over 1 year ago

8.42 kB

	import csv
	import os
	import argparse
	import pandas as pd
	from utilities.anonymize import (
	get_cities_list,
	get_abbreviation_dict_correction,
	reformat_to_report,
	anonymize_analyzer,
	anonymize_engine,
	add_space_to_comma_endpoint,
	get_list_not_deidentify,
	config_deidentify,
	)
	from utilities.translate import get_translation_dict_correction, translate_report
	from utilities.convert import (
	convert_df_no_header,
	convert_df,
	convert_json,
	convert_list_phenogenius,
	convert_pdf_to_text,
	)
	from utilities.extract_hpo import add_biometrics, extract_hpo
	from utilities.get_model import get_models, get_nlp_marian
	import gc


	def main():
	    """Translate, de-identify and summarise the report held in module globals.

	    The function takes no arguments. It reads ``Report``, ``Report_id``,
	    ``Last_name``, ``First_name``, ``args``, ``nlp_fr``, ``marian_fr_en``,
	    ``dict_correction``, ``dict_abbreviation_correction``, ``analyzer``,
	    ``engine`` and ``proper_noun`` from module scope; the ``__main__``
	    section assigns them before each call.

	    Four files are written below ``args.result_dir``, all named
	    ``<Report_id>_<Last_name>_<First_name>_<suffix>``:

	    * ``Reports/..._translated_and_deindentified_report.txt``
	    * ``TSV/..._summarized_report.tsv`` (every extracted row)
	    * ``JSON/..._summarized_report.json`` (rows flagged "To keep in list")
	    * ``TXT/..._summarized_report.txt`` (rows flagged "To keep in list")

	    Files are written as UTF-8 because their content is text that was
	    explicitly decoded from UTF-8 bytes; relying on the platform default
	    encoding could fail or corrupt accented characters.
	    """
	    print("Code Starting")

	    # Shared "<id>_<last>_<first>" stem used by every output file name.
	    file_stem = Report_id + "_" + Last_name + "_" + First_name

	    def output_path(subdir, suffix):
	        # The trailing "" makes os.path.join end with a separator, so the
	        # stem is appended verbatim exactly as the original code did.
	        return os.path.join(args.result_dir, subdir, "") + file_stem + suffix

	    translated_text, _, _ = translate_report(
	        Report,
	        Last_name,
	        First_name,
	        nlp_fr,
	        marian_fr_en,
	        dict_correction,
	        dict_abbreviation_correction,
	    )
	    report_text = reformat_to_report(translated_text, nlp_fr)
	    del translated_text

	    print("Translation and De-identification")
	    analyzed_report, analyzer_results, _, _ = anonymize_analyzer(
	        report_text, analyzer, proper_noun, Last_name, First_name
	    )

	    print(analyzed_report)

	    deidentified_text = anonymize_engine(
	        report_text, analyzer_results, engine, nlp_fr
	    )

	    # One DataFrame row per line of the de-identified report.
	    deidentified_df = pd.DataFrame(deidentified_text.split("\n"))

	    report_suffix = "_translated_and_deindentified_report.txt"
	    with open(output_path("Reports", report_suffix), "w", encoding="utf-8") as file:
	        file.write(convert_df_no_header(deidentified_df).decode("utf-8"))
	    # This message has always shown the bare file name (no directory).
	    print("Text file created successfully : " + file_stem + report_suffix)

	    print("Summarization")

	    spaced_text = add_space_to_comma_endpoint(deidentified_text, nlp_fr)
	    biometrics_text, _ = add_biometrics(spaced_text, nlp_fr)
	    clinphen, clinphen_unsafe = extract_hpo(biometrics_text)

	    # Release the intermediate texts before building the result tables.
	    del deidentified_text
	    del spaced_text
	    del biometrics_text

	    # "Unsafe" extractions are kept in the table but flagged as
	    # low-confidence and excluded from the filtered outputs by default.
	    clinphen_unsafe["To keep in list"] = False
	    clinphen_unsafe["Confidence on extraction"] = "low"

	    clinphen["Confidence on extraction"] = "high"
	    clinphen["To keep in list"] = True

	    cols = [
	        "HPO ID",
	        "Phenotype name",
	        "To keep in list",
	        "No. occurrences",
	        "Earliness (lower = earlier)",
	        "Confidence on extraction",
	        "Example sentence",
	    ]
	    clinphen_all = pd.concat([clinphen, clinphen_unsafe]).reset_index()
	    clinphen_all = clinphen_all[cols]

	    # Element-wise comparison on a pandas column, hence "== True".
	    kept_rows = clinphen_all[
	        clinphen_all["To keep in list"] == True  # noqa: E712
	    ]
	    del clinphen
	    del clinphen_unsafe
	    gc.collect()

	    tsv_path = output_path("TSV", "_summarized_report.tsv")
	    with open(tsv_path, "w", encoding="utf-8") as file:
	        file.write(convert_df(clinphen_all).decode("utf-8"))
	    print("Tsv file created successfully : " + tsv_path)

	    json_path = output_path("JSON", "_summarized_report.json")
	    with open(json_path, "w", encoding="utf-8") as file:
	        file.write(convert_json(kept_rows))
	    print("JSON file created successfully : " + json_path)

	    txt_path = output_path("TXT", "_summarized_report.txt")
	    with open(txt_path, "w", encoding="utf-8") as file:
	        file.write(convert_list_phenogenius(kept_rows))
	    print("Text file created successfully : " + txt_path)


	def parse_visit_line(line):
	    """Split one line of the visits file into its four tab-separated fields.

	    Args:
	        line: Raw line as read from the input file.

	    Returns:
	        ``[Report_id, Last_name, First_name, text_or_link]``, or ``None``
	        when the line is blank.

	    Raises:
	        ValueError: If the line has fewer than four tab-separated fields.
	    """
	    stripped = line.strip()
	    if not stripped:
	        return None
	    # maxsplit=3 keeps any tab inside the free-text report in the last field.
	    fields = stripped.split("\t", 3)
	    if len(fields) != 4:
	        # Only the field count is reported so no patient data is echoed.
	        raise ValueError(
	            f"expected 4 tab-separated fields, found {len(fields)}"
	        )
	    return fields


	# Command-line entry point: parse arguments, prepare output folders, load
	# the models once, then process the input file one visit per line.
	if __name__ == "__main__":

	    print("Welcome to the Clinfly app")

	    parser = argparse.ArgumentParser(description="Description of clinfly arguments")
	    parser.add_argument(
	        "--file",
	        type=str,
	        help="the input file which contains the visits informations",
	        required=True,
	    )
	    parser.add_argument(
	        "--language",
	        choices=["fr", "es", "de"],
	        type=str,
	        help="The language of the input : fr, es , de",
	        required=True,
	    )
	    parser.add_argument(
	        "--model_dir",
	        default=os.path.expanduser("~"),
	        type=str,
	        help="The directory where the models will be downloaded.",
	    )
	    parser.add_argument(
	        "--result_dir",
	        default="Results",
	        type=str,
	        help="The directory where the results will be placed.",
	    )

	    args = parser.parse_args()

	    # exist_ok=True replaces the check-then-create pairs and is race-free.
	    os.makedirs(args.model_dir, exist_ok=True)
	    os.makedirs(args.result_dir, exist_ok=True)
	    for result_subdir in ("Reports", "TSV", "JSON", "TXT"):
	        os.makedirs(os.path.join(args.result_dir, result_subdir), exist_ok=True)

	    print("Language chosen :", args.language)
	    # The return value is not used in this file; the call itself is kept.
	    models_status = get_models(args.language, args.model_dir)
	    dict_correction = get_translation_dict_correction()
	    dict_abbreviation_correction = get_abbreviation_dict_correction()
	    proper_noun = get_list_not_deidentify()
	    cities_list = get_cities_list()
	    analyzer, engine = config_deidentify(cities_list)
	    nlp_fr, marian_fr_en = get_nlp_marian(args.language)

	    file_name = args.file
	    # Module-level names that main() reads; assigned for each input line.
	    Report_id: str
	    Last_name: str
	    First_name: str
	    Report: str

	    if os.path.isfile(args.file):
	        # NOTE(review): the input is read with the platform default
	        # encoding, as before - confirm whether UTF-8 should be forced.
	        with open(file_name, "r") as fichier:
	            for line_number, ligne in enumerate(fichier, start=1):
	                try:
	                    elements = parse_visit_line(ligne)
	                except ValueError as err:
	                    print(f"Skipping line {line_number}: {err}")
	                    continue
	                if elements is None:
	                    # Blank line (e.g. trailing newline at end of file).
	                    continue
	                Report_id, Last_name, First_name, text_or_link = elements
	                print("Report_id:", Report_id)
	                print("Last_name:", Last_name)
	                print("First_name:", First_name)
	                if os.path.exists(text_or_link):
	                    if not text_or_link.lower().endswith(".pdf"):
	                        print("Unsupported file type. Please provide a link to a PDF files.")
	                        # Skip this visit: otherwise Report would be
	                        # undefined or still hold the previous patient's
	                        # text when main() runs.
	                        continue
	                    print(f"Processing PDF file: {text_or_link}")
	                    Report = convert_pdf_to_text(text_or_link)
	                else:
	                    # Not a path on disk: the field is the report text itself.
	                    Report = text_or_link
	                print("Report:", Report)
	                main()
	                print()
	    else:
	        print("Input is not a file. Please provide a valid input.")