"""Helpers for building a Gen3 data-dictionary prompt and presenting the result.

The module builds a text prompt from the headers of uploaded files, renders
the node/link structure of a JSON response as an image, flattens a response
into summary tables, and ships two canned example responses.
"""

import csv
import os
from collections.abc import Iterable
from io import BytesIO
from string import Template
from typing import Optional

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from PIL import Image

# Generation settings. They are not referenced by any function in this block.
# NOTE(review): presumably consumed by the model-calling code elsewhere — confirm.
MAX_INPUT_TOKEN_LENGTH = 1024
TEMPERATURE = 0.1
MAX_NEW_TOKENS = 8192


def remove_chars_loop(text: str, chars_to_remove: Iterable[str]) -> str:
    """Return *text* with every occurrence of each given substring removed.

    Args:
        text: The string to clean.
        chars_to_remove: Substrings to delete. Passing a plain string removes
            each of its characters, because iterating a string yields
            characters.

    Returns:
        The cleaned string. Removals are applied one after another, in
        iteration order.
    """
    for char in chars_to_remove:
        text = text.replace(char, "")
    return text


def get_prompt_with_files_uploaded(filepaths: Optional[list[str]] = None) -> str:
    """Build the data-dictionary generation prompt from uploaded files.

    Only the header row of each file is embedded in the prompt. Headers are
    parsed as tab-delimited and re-joined with tabs.

    Args:
        filepaths: Paths of the uploaded files. ``None`` or an empty list
            means nothing has been uploaded yet.

    Returns:
        The full prompt text, or ``"No files uploaded yet."`` when no paths
        are given.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    if not filepaths:
        return "No files uploaded yet."

    prompt_begin = (
        " You are a data structuring expert tasked with analyzing data files"
        " (CSV, TXT, TSV, XML) to identify their schema and generate output in"
        " the Gen3 Data Dictionary format (JSON). Review the data files for"
        " column names, data types, and relationships, and if a data"
        " dictionary is provided, ensure strict alignment with its metadata."
        " Column names may have embedded information to infer the type and/or"
        " units from. Follow these steps:"
        " - Examine each data file to define the schema"
        " - Cross-reference with the data dictionary, if available, to match"
        " all column definitions and metadata exactly"
        " - Generate an output schema that mirrors the provided data structure"
        " WITHOUT adding any new entities or attributes"
        " - Limit your output to the smallest amount possible of JSON to"
        " capture the necessary information. DO NOT BE VERBOSE"
        " The output must include nodes, properties of those nodes,"
        " descriptions of those properties, and links to other nodes."
        " The ouput must format as ONLY JSON, do not include additional text"
        " and please be concise. Limit your output to only what's necessary"
        " (nodes, properties, descriptions, relationships / links). \n"
    )
    file_template = Template(
        " File name: `$file_name` File contents: ``` $file_contents``` "
    )
    prompt_end = " Please generate the Gen3 Data Dictionary in JSON format: "

    sections = [prompt_begin]
    for path in filepaths:
        # newline="" is what the csv module requires of the files it reads.
        with open(path, "r", encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f, delimiter="\t")
            # fieldnames is None for an empty file; treat that as "no columns"
            # instead of letting str.join raise TypeError.
            fieldnames = reader.fieldnames or []
        sections.append(
            file_template.substitute(
                file_name=os.path.basename(path),
                file_contents="\t".join(fieldnames),
            )
        )
    sections.append(prompt_end)

    prompt = "".join(sections)
    print(f"prompt: {prompt}")
    return prompt


def create_graph_image_from_json(json_response):
    """Render the node/link structure of a response as a PNG image.

    Each entry of ``json_response["nodes"]`` becomes a graph node, and each
    name in its ``"links"`` list becomes an undirected edge. A response that
    is not a dict, or that has no nodes, produces an image of an empty graph.
    Entries that are not dicts or that lack a ``"name"`` are skipped, and a
    missing or null ``"links"`` is treated as "no links".

    Args:
        json_response: The parsed model response.

    Returns:
        A ``PIL.Image.Image`` opened on an in-memory PNG of the drawing.
    """
    adjacency = {}
    if isinstance(json_response, dict):
        for node in json_response.get("nodes") or []:
            if not isinstance(node, dict) or "name" not in node:
                continue
            adjacency[node["name"]] = node.get("links") or []

    graph = nx.from_dict_of_lists(adjacency)
    fig, ax = plt.subplots()
    try:
        nx.draw_networkx(graph, with_labels=True, node_color="lightblue", ax=ax)
        buf = BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)


def create_summary_tables(json_response):
    """Flatten a response into node and node-property summary tables.

    Missing names, types and descriptions are reported as ``"N/A"``. A
    response that is not a dict, or whose ``"nodes"`` or ``"properties"`` is
    missing or null, contributes no rows. Both tables always carry their
    named columns, even when empty.

    Args:
        json_response: The parsed model response.

    Returns:
        A tuple ``(node_descriptions_df, node_property_descriptions_df)``:

        * columns ``"Node Name"``, ``"Generated Description"``
        * columns ``"Node.Property Name"``, ``"Generated Data Type"``,
          ``"Generated Description"``
    """
    nodes = json_response.get("nodes") if isinstance(json_response, dict) else None

    # Dicts keep the original semantics for duplicates: a repeated key keeps
    # its first position and takes the last value seen.
    node_descriptions = {}
    node_property_descriptions = {}
    for node in nodes or []:
        node_name = node.get("name", "N/A")
        node_descriptions[node_name] = node.get("description", "N/A")
        for prop in node.get("properties") or []:
            key = f"{node_name}.{prop.get('name', 'N/A')}"
            node_property_descriptions[key] = (
                prop.get("type", "N/A"),
                prop.get("description", "N/A"),
            )

    # Explicit columns keep the schema stable when there are no rows.
    node_descriptions_df = pd.DataFrame(
        list(node_descriptions.items()),
        columns=["Node Name", "Generated Description"],
    )
    node_property_descriptions_df = pd.DataFrame(
        [
            (key, data_type, description)
            for key, (data_type, description) in node_property_descriptions.items()
        ],
        columns=[
            "Node.Property Name",
            "Generated Data Type",
            "Generated Description",
        ],
    )
    return node_descriptions_df, node_property_descriptions_df


def get_example_ai_model_output_simple():
    """Return a canned example response with four nodes, as JSON text.

    The nodes are ``project``, ``dataset``, ``subject`` and ``sample``. Line
    breaks occur only between JSON tokens, never inside a string value, so
    the text parses with ``json.loads``.
    """
    return """
{
"nodes": [
{ "name": "project",
"description": "Any specifically defined piece of work that is undertaken or attempted to meet a single requirement. (NCIt C47885)",
"links": [],
"required": [ "dbgap_accession_number", "project.id" ],
"properties": [
{ "name": "awg_review", "description": "Indicates that the project is an AWG project.", "type": "boolean" },
{ "name": "data_citation", "description": "The citation for the published dataset.", "type": "string" },
{ "name": "data_contributor", "description": "The name of the organization or individual that the contributed dataset belongs to.", "type": "string" },
{ "name": "data_description", "description": "A brief, free-text description of the data files and associated metadata provided for this dataset.", "type": "string" },
{ "name": "dbgap_accession_number", "description": "The dbgap accession number provided for the project.", "type": "string" },
{ "name": "in_review", "description": "Indicates that the project is under review by the submitter. Upload and data modification is disabled.", "type": "boolean" },
{ "name": "intended_release_date", "description": "Tracks a Project's intended release date.", "type": "string" },
{ "name": "project.id", "description": "A unique identifier for records in this 'project' table.", "type": "string" },
{ "name": "protocol_number", "description": "The project's protocol number or similar amount.", "type": "string" },
{ "name": "releasable", "description": "A project can only be released by the user when `releasable` is true.", "type": "boolean" },
{ "name": "request_submission", "description": "Indicates that the user has requested submission to the GDC for harmonization.", "type": "boolean" },
{ "name": "research_design", "description": "A summary of the goals of the research or a general description of the research's relationship to a clinical application.", "type": "string" },
{ "name": "research_objective", "description": "The general objective of the research; what the researchers hope to discover or determine.", "type": "string" },
{ "name": "research_setup", "description": "A high level description of the setup used to achieve the research objectives.", "type": "string" }
] },
{ "name": "dataset",
"description": "A set of metadata and associated data file objects originating from single a research study, clinical trial or patient cohort.",
"links": [ "project" ],
"required": [ "dataset.id", "project.id" ],
"properties": [
{ "name": "Class_of_Case_Desc", "description": "The text term used to describe the kind of clinical condition that can be defined based on objective criteria or by including all patient information from the case.", "type": "string" },
{ "name": "data_citation", "description": "The citation for the published dataset.", "type": "string" },
{ "name": "full_name", "description": "The full name or title of the dataset or publication.", "type": "string" },
{ "name": "longitudinal", "description": "Indicates whether the dataset has longitudinal or time-series data.", "type": "boolean" },
{ "name": "project.id", "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'dataset' table.", "type": "string" },
{ "name": "dataset.id", "description": "A unique identifier for records in this 'dataset' table.", "type": "string" }
] },
{ "name": "subject",
"description": "The collection of all data related to a specific subject in the context of a specific experiment.",
"links": [ "project", "dataset" ],
"required": [ "dataset.id", "subject.id" ],
"properties": [
{ "name": "date_of_death", "description": "The date of death of the subject in the context of a specific experiment.", "type": "string" },
{ "name": "dataset.id", "description": "Unique identifiers for records in the 'dataset' table that relate via this foreign key to records in this'subject' table.", "type": "string" },
{ "name": "subject.id", "description": "A unique identifier for records in this'subject' table.", "type": "string" },
{ "name": "CancerRegistry_PatientID", "description": "The patient unique id in the case registry.", "type": "string" },
{ "name": "Ethnicity", "description": "An individual's self-described social and cultural grouping, specifically whether an individual describes themselves as Hispanic or Latino. The provided values are based on the categories defined by the U.S. Office of Management and Business and used by the U.S. Census Bureau.", "type": "string" },
{ "name": "Last_Name", "description": "The surname(s) of individual(s) in study, in the form used for cultural or ethnic reasons (e.g., Spanish surnames)", "type": "string" },
{ "name": "Sex_Desc", "description": "The description of the individual's gender.", "type": "string" },
{ "name": "project.id", "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this'subject' table.", "type": "string" }
] },
{ "name": "sample",
"description": "Any material sample taken from a biological entity for testing, diagnostic, propagation, treatment or research purposes, including a sample obtained from a living organism or taken from the biological object after halting of all its life functions. Biospecimen can contain one or more components including but not limited to cellular molecules, cells, tissues, organs, body fluids, embryos, and body excretory products.",
"links": [ "subject" ],
"required": [ "sample.id", "subject.id" ],
"properties": [
{ "name": "body_fluid_code", "description": "The code for the body fluid from which the sample was taken.", "type": "string" },
{ "name": "procedure_date", "description": "Year the sample was taken for analysis.", "type": "integer" },
{ "name": "subject.id", "description": "Unique identifiers for records in the'subject' table that relate via this foreign key to records in this'sample' table.", "type": "string" },
{ "name": "sample.id", "description": "A unique identifier for records in this'sample' table.", "type": "string" }
] }
] }
"""


def get_example_ai_model_output_many():
    """Return a canned example response with eight nodes, as JSON text.

    The nodes are ``project``, ``center``, ``participant``, ``summary_file``,
    ``visit``, ``alias``, ``diagnosis`` and ``exposure``. Line breaks occur
    only between JSON tokens, never inside a string value, so the text parses
    with ``json.loads``.
    """
    return """
{
"nodes": [
{ "name": "project",
"description": "Any specifically defined piece of work that is undertaken or attempted to meet a single requirement. (NCIt C47885)",
"links": [],
"required": [ "code", "dbgap_accession_number", "name", "project.id" ],
"properties": [
{ "name": "category", "description": "The nature of the investigation or investigational use for which clinical study information is being submitted.", "type": "enum" },
{ "name": "code", "description": "Unique identifier for the project.", "type": "string" },
{ "name": "collaborators", "description": "Other organizations (if any) providing support. Support may include funding, design, implementation, data analysis or reporting. The responsible party is responsible for confirming all collaborators before listing them.", "type": "string" },
{ "name": "dbgap_accession_number", "description": "The dbgap accession number provided for the project.", "type": "string" },
{ "name": "investigator_name", "description": "Name of the principal investigator for the project.", "type": "string" },
{ "name": "name", "description": "Display name/brief description for the project.", "type": "string" },
{ "name": "publisher", "description": "An entity responsible for making the resource available. Examples of a Publisher include a person, an organization, or a service. Typically, the name of a Publisher should be used to indicate the entity.", "type": "string" },
{ "name": "released", "description": "To release a project is to tell the GDC to include all submitted entities in the next GDC index.", "type": "boolean" },
{ "name": "study_design_allocation", "description": "The method by which participants are assigned to arms in a clinical trial.", "type": "enum" },
{ "name": "title", "description": "The title of the clinical study, corresponding to the title of the protocol.", "type": "string" },
{ "name": "project.id", "description": "A unique identifier for records in this 'project' table.", "type": "string" },
{ "name": "verification_date", "description": "The date on which the responsible party last verified the clinical study information in the entire ClinicalTrials.gov record for the clinical study, even if no additional or updated information is being submitted.", "type": "string" }
] },
{ "name": "center",
"description": "Genetic Research Center (GRC) or other clinical center at which research participants are recruited.",
"links": [ "project" ],
"required": [ "name", "project.id", "center.id" ],
"properties": [
{ "name": "name", "description": "Name of center at which participants were recruited and/or at which data were collected.", "type": "string" },
{ "name": "project.id", "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'center' table.", "type": "string" },
{ "name": "center.id", "description": "A unique identifier for records in this 'center' table.", "type": "string" }
] },
{ "name": "participant",
"description": "The collection of all data related to a specific subject in the context of a specific project.",
"links": [ "project", "center" ],
"required": [ "participant.id", "project.id" ],
"properties": [
{ "name": "initials", "description": "The participant's initials.", "type": "string" },
{ "name": "project.id", "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'participant' table.", "type": "string" },
{ "name": "participant.id", "description": "A unique identifier for records in this 'participant' table.", "type": "string" },
{ "name": "consent_codes", "description": "", "type": "array" },
{ "name": "consented_for_data_sharing", "description": "The participant has consented to share their data.", "type": "boolean" },
{ "name": "consortium_id_of_affected_spouse", "description": "TBD", "type": "integer" },
{ "name": "mothers_consortium_id", "description": "TBD", "type": "integer" },
{ "name": "center.id", "description": "Unique identifiers for records in the 'center' table that relate via this foreign key to records in this 'participant' table.", "type": "string" }
] },
{ "name": "summary_file",
"description": "A summary of the data file, including the number of rows, columns, and data types.",
"links": [ "center" ],
"required": [ "data_format", "file_size", "center.id", "summary_file.id" ],
"properties": [
{ "name": "data_format", "description": "Format of the data files.", "type": "enum" },
{ "name": "file_size", "description": "The size of the data file (object) in bytes.", "type": "integer" },
{ "name": "center.id", "description": "Unique identifiers for records in the 'center' table that relate via this foreign key to records in this'summary_file' table.", "type": "string" },
{ "name": "summary_file.id", "description": "A unique identifier for records in this'summary_file' table.", "type": "string" }
] },
{ "name": "visit",
"description": "A visit by a patient or study participant to a medical professional. A clinical encounter that encompasses planned and unplanned trial interventions, procedures and assessments that may be performed on a participant. A visit has a start and an end, each described with a rule. The process by which information about the health status of an individual is obtained before and after a study has officially closed; an activity that continues something that has already begun or that repeats something that has already been done.",
"links": [ "participant" ],
"required": [ "visit.id", "participant.id" ],
"properties": [
{ "name": "age_at_visit", "description": "The study participant's age, in years, at the visit. If the age is greater than 89 years, use the age_at_visit_gt89 property instead.", "type": "number" },
{ "name": "bmi", "description": "The body mass divided by the square of the body height expressed in units of kg/m^2.", "type": "number" },
{ "name": "days_to_follow_up", "description": "Number of days between the date used for index and the date the patient was seen or contacted at follow-up.", "type": "integer" },
{ "name": "ever_transferred", "description": "Participant ever transferred sites (changed ids)", "type": "enum" },
{ "name": "harmonized_visit_number", "description": "The derived harmonized visit number for the studies MACS and WIHS.", "type": "integer" },
{ "name": "health_insurance", "description": "Currently have any health insurance", "type": "boolean" },
{ "name": "review_yr", "description": "Year in which the participant's visit was reviewed", "type": "integer" },
{ "name": "visit_date", "description": "Year of the visit.", "type": "integer" },
{ "name": "visit_number", "description": "Visit number", "type": "integer" },
{ "name": "visit_type", "description": "Define if the visit is a follow-up or the baseline visit.", "type": "enum" },
{ "name": "weight", "description": "The weight of the participant measured in grams.", "type": "number" },
{ "name": "participant.id", "description": "Unique identifiers for records in the 'participant' table that relate via this foreign key to records in this 'visit' table.", "type": "string" },
{ "name": "visit.id", "description": "A unique identifier for records in this 'visit' table.", "type": "string" }
] },
{ "name": "alias",
"description": "An alias for the subject.",
"links": [ "participant" ],
"required": [ "participant.id", "alias.id" ],
"properties": [
{ "name": "participant.id", "description": "Unique identifiers for records in the 'participant' table that relate via this foreign key to records in this 'alias' table.", "type": "string" },
{ "name": "alias.id", "description": "A unique identifier for records in this 'alias' table.", "type": "string" }
] },
{ "name": "diagnosis",
"description": "Data from the investigation, analysis and recognition of the presence and nature of disease, condition, or injury from expressed signs and symptoms; also, the scientific determination of any kind; the concise results of such an investigation.",
"links": [ "visit" ],
"required": [ "visit.id", "diagnosis.id" ],
"properties": [
{ "name": "age_at_diagnosis", "description": "The age of the patient at the time of diagnosis.", "type": "number" },
{ "name": "age_at_diagnosis_gt89", "description": "Indicates if the age at diagnosis is greater than 89 years.", "type": "enum" },
{ "name": "ibd_affection_status", "description": "The IBD Affection Status of the patient.", "type": "enum" },
{ "name": "visit.id", "description": "Unique identifiers for records in the 'visit' table that relate via this foreign key to records in this 'diagnosis' table.", "type": "string" },
{ "name": "diagnosis.id", "description": "A unique identifier for records in this 'diagnosis' table.", "type": "string" }
] },
{ "name": "exposure",
"description": "Data related to exposure information.",
"links": [ "visit" ],
"required": [ "visit.id", "exposure.id" ],
"properties": [
{ "name": "nocigar_day_unknown", "description": "Unknown", "type": "enum" },
{ "name": "smoking", "description": "Smoking", "type": "enum" },
{ "name": "smoking_stop", "description": "Smoking stop", "type": "enum" },
{ "name": "visit.id", "description": "Unique identifiers for records in the 'visit' table that relate via this foreign key to records in this 'exposure' table.", "type": "string" },
{ "name": "exposure.id", "description": "A unique identifier for records in this 'exposure' table.", "type": "string" }
] }
] }
"""