Spaces:

uc-ctds
/

GDC-QAG

Running on Zero

App Files Files Community

test_pr_1

by aatu18 - opened Jul 25

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+366

-19

Files changed (2) hide show

app.py +257 -19
old_app.py +109 -0

app.py CHANGED Viewed

@@ -1,23 +1,24 @@
 import os
-import json
 import gradio as gr
 import pandas as pd
 import spaces
 import torch
 from methods import gdc_api_calls, utilities
-from gdc_pipeline import execute_pipeline, setup_args, setup_models_and_data
 from transformers import AutoTokenizer, BertTokenizer, AutoModelForCausalLM, BertForSequenceClassification
 working_llama_token = os.environ.get("let_this_please_work", False)
 hf_TOKEN = os.environ.get("fineTest", False)
 intent_token = os.environ.get("query_intent_test", False)
-# setup models and data
-# qag_requirements = setup_models_and_data(hf_TOKEN, working_llama_token, intent_token)
 print("getting gdc project information")
 project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
@@ -50,26 +51,262 @@ model = AutoModelForCausalLM.from_pretrained(
 model = model.to('cuda').eval()
-# question = 'What is the co-occurence frequency of somatic homozygous deletions in CDKN2A and CDKN2B in the mesothelioma project TCGA-MESO in the genomic data commons?'
-def wrapped_execute_pipeline(question: str):
-    df = pd.DataFrame({'questions' : [question]})
-    print(f'Question received: {question}')
     try:
-        result = execute_pipeline(
-            df,
             gdc_genes_mutations,
             model,
             tok,
-            intent_model,
-            intent_tok,
             project_mappings,
-            output_file_prefix=None
         )
-    except Exception as e:
-        result = f'Unable to execute GDC API, can you please retry with a template question? Error: {e}'
     return result
 def visible_component(input_text):
     return gr.update(value="WHATEVER")
@@ -100,10 +337,11 @@ with gr.Blocks(title="GDC QAG MCP server") as GDC_QAG_QUERY:
     )
     search_button.click(
-        fn=wrapped_execute_pipeline,
         inputs=[query_input],
         outputs=output,
     )
 if __name__ == "__main__":
     GDC_QAG_QUERY.launch(mcp_server=True, show_api=True)

 import os
+from types import SimpleNamespace
 import gradio as gr
 import pandas as pd
 import spaces
 import torch
 from methods import gdc_api_calls, utilities
 from transformers import AutoTokenizer, BertTokenizer, AutoModelForCausalLM, BertForSequenceClassification
+from guidance import gen as guidance_gen
+from guidance.models import Transformers
+from transformers import set_seed
+from methods import gdc_api_calls, utilities
+# set up various tokens
 working_llama_token = os.environ.get("let_this_please_work", False)
 hf_TOKEN = os.environ.get("fineTest", False)
 intent_token = os.environ.get("query_intent_test", False)
+# set up requirements: models and data
 print("getting gdc project information")
 project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
 model = model.to('cuda').eval()
+# execute_api_call
+def execute_api_call(
+    intent,
+    gene_entities,
+    mutation_entities,
+    cancer_entities,
+    query,
+    gdc_genes_mutations,
+    project_mappings,
+):
+    """Run the GDC helper that corresponds to ``intent``.
+
+    Returns a ``(result, cancer_entities)`` pair. Helpers that hand back their
+    own cancer-entity list replace the one passed in; the other branches return
+    the caller's ``cancer_entities`` unchanged. An intent with no matching
+    branch yields a fixed explanatory string as ``result``.
+    """
+    if intent == "ssm_frequency":
+        outcome, entities = utilities.get_ssm_frequency(
+            gene_entities, mutation_entities, cancer_entities, project_mappings
+        )
+        return outcome, entities
+
+    if intent == "top_mutated_genes_by_project":
+        outcome = gdc_api_calls.get_top_mutated_genes_by_project(
+            cancer_entities, top_k=10
+        )
+        return outcome, cancer_entities
+
+    if intent == "most_frequently_mutated_gene":
+        # Same helper as above, restricted to the single top gene.
+        outcome = gdc_api_calls.get_top_mutated_genes_by_project(
+            cancer_entities, top_k=1
+        )
+        return outcome, cancer_entities
+
+    if intent == "freq_cnv_loss_or_gain":
+        outcome, entities = gdc_api_calls.get_freq_cnv_loss_or_gain(
+            gene_entities, cancer_entities, query, cnv_and_ssm_flag=False
+        )
+        return outcome, entities
+
+    if intent == "msi_h_frequency":
+        outcome, entities = gdc_api_calls.get_msi_frequency(cancer_entities)
+        return outcome, entities
+
+    if intent == "cnv_and_ssm":
+        outcome, entities = utilities.get_freq_of_cnv_and_ssms(
+            query, cancer_entities, gene_entities, gdc_genes_mutations
+        )
+        return outcome, entities
+
+    if intent == "top_cases_counts_by_gene":
+        outcome, entities = gdc_api_calls.get_top_cases_counts_by_gene(
+            gene_entities, cancer_entities
+        )
+        return outcome, entities
+
+    if intent == "project_summary":
+        outcome = gdc_api_calls.get_project_summary(cancer_entities)
+        return outcome, cancer_entities
+
+    # No branch matched: report it instead of raising.
+    return "user intent not recognized, or use case not covered", cancer_entities
+# function to combine entities, intent and API call
+def construct_and_execute_api_call(
+    query, gdc_genes_mutations, project_mappings, intent_model, intent_tok
+):
+    """Infer entities and intent for ``query``, then run the matching API call.
+
+    Returns a ``SimpleNamespace`` with ``helper_output``, ``cancer_entities``,
+    ``intent``, ``gene_entities`` and ``mutation_entities``. If the API call
+    raises, ``helper_output`` and ``cancer_entities`` are both empty lists.
+    """
+    print("query:\n{}\n".format(query))
+
+    # --- cancer entities: first NER model, then a fallback model ---
+    ner_hits = utilities.return_initial_cancer_entities(
+        query, model="en_ner_bc5cdr_md"
+    )
+    if not ner_hits:
+        try:
+            ner_hits = utilities.return_initial_cancer_entities(
+                query, model="en_core_sci_md"
+            )
+        except Exception as e:
+            print("unable to guess cancer entities {}".format(str(e)))
+            ner_hits = []
+    # Nothing recognised at all means "every known project".
+    cancer_entities = utilities.postprocess_cancer_entities(
+        project_mappings, initial_cancer_entities=ner_hits, query=query
+    ) or list(project_mappings.keys())
+
+    # --- gene and mutation entities ---
+    gene_entities = utilities.infer_gene_entities_from_query(query, gdc_genes_mutations)
+    mutation_entities = utilities.infer_mutation_entities(
+        gene_entities=gene_entities,
+        query=query,
+        gdc_genes_mutations=gdc_genes_mutations,
+    )
+    print("gene entities {}".format(gene_entities))
+    print("mutation entities {}".format(mutation_entities))
+    print("cancer entities {}".format(cancer_entities))
+
+    # --- user intent ---
+    intent = utilities.infer_user_intent(query, intent_model, intent_tok)
+    print("user intent:\n{}\n".format(intent))
+
+    # --- API call; any failure degrades to empty outputs ---
+    try:
+        api_call_result, cancer_entities = execute_api_call(
+            intent,
+            gene_entities,
+            mutation_entities,
+            cancer_entities,
+            query,
+            gdc_genes_mutations,
+            project_mappings,
+        )
+        print("api_call_result {}".format(api_call_result))
+    except Exception as e:
+        print("unable to process query {} {}".format(query, str(e)))
+        api_call_result, cancer_entities = [], []
+
+    summary = SimpleNamespace(
+        helper_output=api_call_result,
+        cancer_entities=cancer_entities,
+        intent=intent,
+        gene_entities=gene_entities,
+        mutation_entities=mutation_entities,
+    )
+    return summary
+# generate llama model response
+@spaces.GPU(duration=30)
+def generate_response(modified_query, model, tok):
+    """Generate a regex-constrained answer for ``modified_query``.
+
+    The model and tokenizer are wrapped in a guidance ``Transformers`` object,
+    the prompt is appended, and one generation is requested with
+    ``temperature=0`` and ``max_tokens=1000``. The generated text must match
+    ``The final answer is: <digits>.<digits>%``.
+
+    Returns the text captured under the ``"gen_response"`` key.
+
+    NOTE(review): the ``spaces.GPU(duration=30)`` decorator presumably requests
+    a GPU slot for this call; confirm against the ``spaces`` package docs.
+    """
+    # Fixed seed so repeated calls start from the same RNG state.
+    set_seed(1042)
+    # Raw string: "\d" and "\." are regex escapes, not Python string escapes.
+    # In a plain literal they are invalid escape sequences and trigger a
+    # warning on recent Python versions. The pattern value is unchanged.
+    regex = r"The final answer is: \d*\.\d*%"
+    lm = Transformers(model=model, tokenizer=tok)
+    lm += modified_query
+    print(f"lm: {lm}")
+    lm += guidance_gen(
+        "gen_response",
+        n=1,
+        temperature=0,
+        max_tokens=1000,
+        regex=regex
+    )
+    print(f"lm with response: {lm}")
+    return lm["gen_response"]
+def batch_test(
+    query,
+    model,
+    tok,
+    gdc_genes_mutations,
+    project_mappings,
+    intent_model,
+    intent_tok
+):
+    """Produce the base LLM answer and the GDC helper output for one query.
+
+    Returns a ``pd.Series`` holding, in order: the base LLM output, the helper
+    output, the cancer entities, the intent, the gene entities and the mutation
+    entities. When the helper stage fails, or does not yield exactly one output
+    per cancer entity, the helper output and cancer entities are empty lists.
+    """
+    modified_query = utilities.construct_modified_query_base_llm(query)
+    print(f"modified_query is: {modified_query}")
+    llama_base_output = generate_response(modified_query, model, tok)
+    print(f"llama_base_output: {llama_base_output}")
+    try:
+        result = construct_and_execute_api_call(
+            query, gdc_genes_mutations, project_mappings, intent_model, intent_tok
+        )
+    except Exception as e:
+        # ``result`` is never bound when the call raises, so build a complete
+        # placeholder instead of assigning attributes on a missing object.
+        print("unable to construct or execute api call for {}: {}".format(query, e))
+        result = SimpleNamespace(
+            helper_output=[],
+            cancer_entities=[],
+            intent=None,
+            gene_entities=[],
+            mutation_entities=[],
+        )
+    # Require one helper output per cancer entity. The comparison result has to
+    # be acted on; an unsized helper output (TypeError from len) counts as a
+    # mismatch as well.
+    try:
+        lengths_match = len(result.helper_output) == len(result.cancer_entities)
+    except TypeError:
+        lengths_match = False
+    if not lengths_match:
+        # Log so the query can be inspected and reprocessed later.
+        msg = (
+            "there is not a unique helper output for each unique "
+            "cancer entity in {}".format(query)
+        )
+        print("exception {}".format(msg))
+        result.helper_output = []
+        result.cancer_entities = []
+    return pd.Series(
+        [
+            llama_base_output,
+            result.helper_output,
+            result.cancer_entities,
+            result.intent,
+            result.gene_entities,
+            result.mutation_entities,
+        ]
+    )
+def get_prefinal_response(row, model, tok):
+    """Build the helper-augmented prompt for ``row`` and generate a response.
+
+    Reads ``row["questions"]`` and ``row["helper_output"]``, combines them with
+    ``utilities.construct_modified_query`` and passes the prompt to
+    ``generate_response``.
+
+    Returns a ``pd.Series`` of ``[modified_query, generated_response]``.
+    Re-raises whatever exception the row lookups raise, after logging it.
+    """
+    try:
+        query = row["questions"]
+        helper_output = row["helper_output"]
+    except Exception as e:
+        # Neither name is guaranteed to be bound here, so report the error
+        # itself and re-raise; continuing would only fail later on an
+        # unbound local.
+        print(f"unable to retrieve query or helper_output from row: {e!r}")
+        raise
+    modified_query = utilities.construct_modified_query(query, helper_output)
+    prefinal_llama_with_helper_output = generate_response(modified_query, model, tok)
+    return pd.Series([modified_query, prefinal_llama_with_helper_output])
+def execute_pipeline(question: str):
+    """Answer one natural-language GDC question end to end.
+
+    The question is placed in a one-row DataFrame and run through
+    ``batch_test`` (base LLM answer plus GDC helper output). Rows are kept only
+    when they have a non-empty helper output and exactly one helper output per
+    cancer entity, then exploded to one row per entity. Each row goes through
+    ``get_prefinal_response`` and ``utilities.postprocess_response``, and the
+    frame is reduced to ``utilities.get_final_columns()``.
+
+    Returns the transposed result DataFrame, or an explanatory string when no
+    row survives the filters.
+    """
+    df = pd.DataFrame({'questions' : [question]})
+    print(f'Question received: {question}')
+    print("starting pipeline")
+    cuda_available = torch.cuda.is_available()
+    print("CUDA available:", cuda_available)
+    # Only ask for the device name when a device exists; this is a diagnostic
+    # print and must not abort the request.
+    if cuda_available:
+        print("CUDA device name:", torch.cuda.get_device_name(0))
+    # queries input file
+    print(f"running test on input {df}")
+    df[
+        [
+            "llama_base_output",
+            "helper_output",
+            "cancer_entities",
+            "intent",
+            "gene_entities",
+            "mutation_entities",
+        ]
+    ] = df["questions"].apply(
+        lambda x: batch_test(
+            x,
+            model,
+            tok,
+            gdc_genes_mutations,
+            project_mappings,
+            intent_model,
+            intent_tok
+        )
+    )
+    # retain responses with helper output
+    df["len_helper"] = df["helper_output"].apply(len)
+    # Copy the slice so the column assignments below act on an independent
+    # frame rather than on a possible view of ``df``.
+    df_filtered = df[df["len_helper"] != 0].copy()
+    df_filtered["len_ce"] = df_filtered["cancer_entities"].apply(len)
+    # retain rows where one response is retrieved for each cancer entity
+    # (vectorized comparison: also well defined when the frame is empty)
+    df_filtered["ce_eq_helper"] = df_filtered["len_ce"] == df_filtered["len_helper"]
+    df_filtered = df_filtered[df_filtered["ce_eq_helper"]]
+    if df_filtered.empty:
+        # Nothing usable came back from the helper stage; the remaining steps
+        # need at least one row, so report this to the user instead.
+        print("no helper output retained for question: {}".format(question))
+        return "Unable to execute GDC API, can you please retry with a template question?"
+    df_filtered_exploded = df_filtered.explode(
+        ["helper_output", "cancer_entities"], ignore_index=True
+    )
+    df_filtered_exploded[["modified_prompt", "pre_final_llama_with_helper_output"]] = (
+        df_filtered_exploded.apply(
+            lambda x: get_prefinal_response(x, model, tok), axis=1
+        )
+    )
+    ### postprocess response
+    print("postprocessing response")
+    df_filtered_exploded[
+        [
+            "llama_base_stat",
+            "delta_llama",
+            "value_changed",
+            "ground_truth_stat",
+            "generated_stat_prefinal",
+            "delta_prefinal",
+            "generated_stat_final",
+            "delta_final",
+            "final_response",
+        ]
+    ] = df_filtered_exploded.apply(utilities.postprocess_response, axis=1)
+    final_columns = utilities.get_final_columns()
+    result = df_filtered_exploded[final_columns].T
+    print('result {}'.format(result))
+    print('completed')
+    return result
 def visible_component(input_text):
     return gr.update(value="WHATEVER")
     )
     search_button.click(
+        fn=execute_pipeline,
         inputs=[query_input],
         outputs=output,
     )
 if __name__ == "__main__":
     GDC_QAG_QUERY.launch(mcp_server=True, show_api=True)

old_app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import os
+import json
+import gradio as gr
+import pandas as pd
+import spaces
+import torch
+from methods import gdc_api_calls, utilities
+from gdc_pipeline import execute_pipeline, setup_args, setup_models_and_data
+from transformers import AutoTokenizer, BertTokenizer, AutoModelForCausalLM, BertForSequenceClassification
+# Access tokens are read from environment variables; each falls back to False
+# when the variable is not set.
+working_llama_token = os.environ.get("let_this_please_work", False)
+hf_TOKEN = os.environ.get("fineTest", False)
+intent_token = os.environ.get("query_intent_test", False)
+# setup models and data (done inline below; the helper call is kept for reference)
+# qag_requirements = setup_models_and_data(hf_TOKEN, working_llama_token, intent_token)
+print("getting gdc project information")
+# NOTE(review): start/stop presumably bound the range of projects fetched; confirm in gdc_api_calls.
+project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
+print('loading intent model')
+# model_id is reused: it names the intent classifier here and the LLM further down.
+model_id = 'uc-ctds/query_intent'
+intent_tok = AutoTokenizer.from_pretrained(
+    model_id, trust_remote_code=True,
+    token=intent_token
+)
+intent_model = BertForSequenceClassification.from_pretrained(
+    model_id, token=intent_token)
+# Move to the CUDA device and switch to eval mode.
+intent_model = intent_model.to('cuda').eval()
+print("loading gdc genes and mutations")
+gdc_genes_mutations = utilities.load_gdc_genes_mutations_hf(hf_TOKEN)
+print("loading llama-3B model")
+# From here on model_id refers to the Llama model.
+model_id = "meta-llama/Llama-3.2-3B-Instruct"
+tok = AutoTokenizer.from_pretrained(
+    model_id, trust_remote_code=True,
+    token=working_llama_token
+)
+# Weights are requested as float16.
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16,
+    trust_remote_code=True,
+    token=working_llama_token
+)
+# Move to the CUDA device and switch to eval mode.
+model = model.to('cuda').eval()
+# question = 'What is the co-occurrence frequency of somatic homozygous deletions in CDKN2A and CDKN2B in the mesothelioma project TCGA-MESO in the genomic data commons?'
+def wrapped_execute_pipeline(question: str):
+    """Run ``execute_pipeline`` on a single question for the Gradio UI.
+
+    The question is wrapped in a one-row DataFrame under the ``questions``
+    column and passed on together with the module-level models and data.
+    Returns the pipeline result, or an explanatory message string if the
+    pipeline raises.
+    """
+    frame = pd.DataFrame({'questions' : [question]})
+    print(f'Question received: {question}')
+    try:
+        return execute_pipeline(
+            frame,
+            gdc_genes_mutations,
+            model,
+            tok,
+            intent_model,
+            intent_tok,
+            project_mappings,
+            output_file_prefix=None
+        )
+    except Exception as err:
+        # Surface the failure in the output box instead of raising to Gradio.
+        return f'Unable to execute GDC API, can you please retry with a template question? Error: {err}'
+def visible_component(input_text):
+    """Return a Gradio update that sets a component's value to "WHATEVER".
+
+    ``input_text`` is accepted but not used.
+    """
+    placeholder_update = gr.update(value="WHATEVER")
+    return placeholder_update
+# Create Gradio interface: a query box, a search button and a result box.
+with gr.Blocks(title="GDC QAG MCP server") as GDC_QAG_QUERY:
+    # Page heading.
+    gr.Markdown(
+        """
+        # GDC QAG Service
+        """
+    )
+    with gr.Row():
+        # Free-text question entered by the user.
+        query_input = gr.Textbox(
+            lines = 3,
+            label="Search Query",
+            placeholder='e.g. "What is the co-occurence frequency of somatic homozygous deletions in CDKN2A and CDKN2B in the mesothelioma project TCGA-MESO in the genomic data commons?"',
+            info="Required: Enter your search query",
+        )
+    search_button = gr.Button("Search", variant="primary")
+    # Receives whatever wrapped_execute_pipeline returns.
+    output = gr.Textbox(
+        label="Query Result",
+        lines=10,
+        max_lines=25,
+        info="The Result of the Query will appear here",
+    )
+    # Clicking the button sends the query text to the pipeline wrapper and
+    # writes the return value into the output box.
+    search_button.click(
+        fn=wrapped_execute_pipeline,
+        inputs=[query_input],
+        outputs=output,
+    )
+# Launch only when run as a script.
+# NOTE(review): mcp_server=True presumably also exposes the app as an MCP server; confirm in the Gradio docs.
+if __name__ == "__main__":
+    GDC_QAG_QUERY.launch(mcp_server=True, show_api=True)