Files changed (2) hide show
  1. app.py +257 -19
  2. old_app.py +109 -0
app.py CHANGED
@@ -1,23 +1,24 @@
1
  import os
2
-
3
- import json
4
  import gradio as gr
5
  import pandas as pd
6
  import spaces
7
  import torch
8
  from methods import gdc_api_calls, utilities
9
- from gdc_pipeline import execute_pipeline, setup_args, setup_models_and_data
10
  from transformers import AutoTokenizer, BertTokenizer, AutoModelForCausalLM, BertForSequenceClassification
 
 
 
 
 
11
 
12
 
 
13
  working_llama_token = os.environ.get("let_this_please_work", False)
14
  hf_TOKEN = os.environ.get("fineTest", False)
15
  intent_token = os.environ.get("query_intent_test", False)
16
 
17
-
18
- # setup models and data
19
- # qag_requirements = setup_models_and_data(hf_TOKEN, working_llama_token, intent_token)
20
-
21
  print("getting gdc project information")
22
  project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
23
 
@@ -50,26 +51,262 @@ model = AutoModelForCausalLM.from_pretrained(
50
  model = model.to('cuda').eval()
51
 
52
 
53
- # question = 'What is the co-occurence frequency of somatic homozygous deletions in CDKN2A and CDKN2B in the mesothelioma project TCGA-MESO in the genomic data commons?'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- def wrapped_execute_pipeline(question: str):
56
- df = pd.DataFrame({'questions' : [question]})
57
- print(f'Question received: {question}')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  try:
59
- result = execute_pipeline(
60
- df,
 
 
 
 
61
  gdc_genes_mutations,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  model,
63
  tok,
64
- intent_model,
65
- intent_tok,
66
  project_mappings,
67
- output_file_prefix=None
 
68
  )
69
- except Exception as e:
70
- result = f'Unable to execute GDC API, can you please retry with a template question? Error: {e}'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  return result
72
 
 
 
73
  def visible_component(input_text):
74
  return gr.update(value="WHATEVER")
75
 
@@ -100,10 +337,11 @@ with gr.Blocks(title="GDC QAG MCP server") as GDC_QAG_QUERY:
100
  )
101
 
102
  search_button.click(
103
- fn=wrapped_execute_pipeline,
104
  inputs=[query_input],
105
  outputs=output,
106
  )
107
 
 
108
  if __name__ == "__main__":
109
  GDC_QAG_QUERY.launch(mcp_server=True, show_api=True)
 
1
  import os
2
+ from types import SimpleNamespace
 
3
  import gradio as gr
4
  import pandas as pd
5
  import spaces
6
  import torch
7
  from methods import gdc_api_calls, utilities
 
8
  from transformers import AutoTokenizer, BertTokenizer, AutoModelForCausalLM, BertForSequenceClassification
9
+ from guidance import gen as guidance_gen
10
+ from guidance.models import Transformers
11
+ from transformers import set_seed
12
+
13
+ from methods import gdc_api_calls, utilities
14
 
15
 
16
+ # set up various tokens
17
  working_llama_token = os.environ.get("let_this_please_work", False)
18
  hf_TOKEN = os.environ.get("fineTest", False)
19
  intent_token = os.environ.get("query_intent_test", False)
20
 
21
+ # set up requirements: models and data
 
 
 
22
  print("getting gdc project information")
23
  project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)
24
 
 
51
  model = model.to('cuda').eval()
52
 
53
 
54
+ # execute_api_call
55
def execute_api_call(
    intent,
    gene_entities,
    mutation_entities,
    cancer_entities,
    query,
    gdc_genes_mutations,
    project_mappings,
):
    """Run the GDC helper that corresponds to the inferred user intent.

    Args:
        intent: label naming the use case; compared against the literal
            intent names handled below.
        gene_entities: gene entities forwarded to the helpers.
        mutation_entities: mutation entities forwarded to the SSM helper.
        cancer_entities: cancer entities forwarded to the helpers; several
            helpers hand back a replacement for this value.
        query: the user query, forwarded to the CNV helpers.
        gdc_genes_mutations: forwarded to the combined CNV/SSM helper.
        project_mappings: forwarded to the SSM frequency helper.

    Returns:
        A ``(result, cancer_entities)`` tuple. When ``intent`` matches none
        of the known names, ``result`` is a fixed explanatory string and
        ``cancer_entities`` is handed back as received.
    """
    if intent == "ssm_frequency":
        outcome, cancer_entities = utilities.get_ssm_frequency(
            gene_entities, mutation_entities, cancer_entities, project_mappings
        )
        return outcome, cancer_entities

    if intent == "top_mutated_genes_by_project":
        outcome = gdc_api_calls.get_top_mutated_genes_by_project(
            cancer_entities, top_k=10
        )
        return outcome, cancer_entities

    if intent == "most_frequently_mutated_gene":
        # Same helper as above, restricted to the single top gene.
        outcome = gdc_api_calls.get_top_mutated_genes_by_project(
            cancer_entities, top_k=1
        )
        return outcome, cancer_entities

    if intent == "freq_cnv_loss_or_gain":
        outcome, cancer_entities = gdc_api_calls.get_freq_cnv_loss_or_gain(
            gene_entities, cancer_entities, query, cnv_and_ssm_flag=False
        )
        return outcome, cancer_entities

    if intent == "msi_h_frequency":
        outcome, cancer_entities = gdc_api_calls.get_msi_frequency(cancer_entities)
        return outcome, cancer_entities

    if intent == "cnv_and_ssm":
        outcome, cancer_entities = utilities.get_freq_of_cnv_and_ssms(
            query, cancer_entities, gene_entities, gdc_genes_mutations
        )
        return outcome, cancer_entities

    if intent == "top_cases_counts_by_gene":
        outcome, cancer_entities = gdc_api_calls.get_top_cases_counts_by_gene(
            gene_entities, cancer_entities
        )
        return outcome, cancer_entities

    if intent == "project_summary":
        outcome = gdc_api_calls.get_project_summary(cancer_entities)
        return outcome, cancer_entities

    # No branch matched: report it instead of calling any helper.
    return "user intent not recognized, or use case not covered", cancer_entities
95
+
96
+
97
+ # function to combine entities, intent and API call
98
def construct_and_execute_api_call(
    query, gdc_genes_mutations, project_mappings, intent_model, intent_tok
):
    """Extract entities and intent from ``query`` and run the matching GDC call.

    Cancer entities are looked up with the "en_ner_bc5cdr_md" model first
    and, when that yields nothing, with "en_core_sci_md". After
    post-processing, an empty result falls back to every key of
    ``project_mappings``.

    Args:
        query: the user question.
        gdc_genes_mutations: forwarded to the gene/mutation inference
            helpers and to ``execute_api_call``.
        project_mappings: mapping whose keys are used as the fallback list
            of cancer entities.
        intent_model: forwarded to ``utilities.infer_user_intent``.
        intent_tok: forwarded to ``utilities.infer_user_intent``.

    Returns:
        A ``SimpleNamespace`` with ``helper_output``, ``cancer_entities``,
        ``intent``, ``gene_entities`` and ``mutation_entities``. When the
        API call raises, ``helper_output`` and ``cancer_entities`` are both
        empty lists.
    """
    print("query:\n{}\n".format(query))

    # Infer entities: primary NER model first, secondary model as fallback.
    detected = utilities.return_initial_cancer_entities(
        query, model="en_ner_bc5cdr_md"
    )
    if not detected:
        try:
            detected = utilities.return_initial_cancer_entities(
                query, model="en_core_sci_md"
            )
        except Exception as err:
            print("unable to guess cancer entities {}".format(str(err)))
            detected = []

    cancer_entities = utilities.postprocess_cancer_entities(
        project_mappings, initial_cancer_entities=detected, query=query
    )
    if not cancer_entities:
        # Nothing recognised by either model: consider all projects.
        cancer_entities = list(project_mappings.keys())

    gene_entities = utilities.infer_gene_entities_from_query(query, gdc_genes_mutations)
    mutation_entities = utilities.infer_mutation_entities(
        gene_entities=gene_entities,
        query=query,
        gdc_genes_mutations=gdc_genes_mutations,
    )

    print("gene entities {}".format(gene_entities))
    print("mutation entities {}".format(mutation_entities))
    print("cancer entities {}".format(cancer_entities))

    # Infer user intent, then dispatch to the corresponding helper.
    intent = utilities.infer_user_intent(query, intent_model, intent_tok)
    print("user intent:\n{}\n".format(intent))

    try:
        helper_output, cancer_entities = execute_api_call(
            intent=intent,
            gene_entities=gene_entities,
            mutation_entities=mutation_entities,
            cancer_entities=cancer_entities,
            query=query,
            gdc_genes_mutations=gdc_genes_mutations,
            project_mappings=project_mappings,
        )
        print("api_call_result {}".format(helper_output))
    except Exception as err:
        # Best effort: a failed call is reported and yields empty outputs.
        print("unable to process query {} {}".format(query, str(err)))
        helper_output = []
        cancer_entities = []

    fields = {
        "helper_output": helper_output,
        "cancer_entities": cancer_entities,
        "intent": intent,
        "gene_entities": gene_entities,
        "mutation_entities": mutation_entities,
    }
    return SimpleNamespace(**fields)
159
+
160
+
161
+ # generate llama model response
162
@spaces.GPU(duration=30)
def generate_response(modified_query, model, tok):
    """Generate a regex-constrained answer to ``modified_query``.

    The seed is fixed with ``set_seed(1042)``, the model and tokenizer are
    wrapped in a guidance ``Transformers`` object, the prompt is appended,
    and one generation is requested with temperature 0 and at most 1000
    tokens, constrained to the pattern
    ``The final answer is: <digits>.<digits>%``.

    Args:
        modified_query: the full prompt text appended to the guidance model.
        model: model object handed to ``Transformers``.
        tok: tokenizer object handed to ``Transformers``.

    Returns:
        The text captured under the name ``"gen_response"``.
    """
    set_seed(1042)
    # Raw string: "\d" and "\." are regex escapes, not Python string escapes.
    # In a plain literal they are invalid escape sequences and trigger a
    # SyntaxWarning on recent Python versions; the pattern value is unchanged.
    regex = r"The final answer is: \d*\.\d*%"
    lm = Transformers(model=model, tokenizer=tok)
    lm += modified_query
    print(f"lm: {lm}")
    lm += guidance_gen(
        "gen_response",
        n=1,
        temperature=0,
        max_tokens=1000,
        regex=regex,
    )
    print(f"lm with response: {lm}")
    return lm["gen_response"]
178
+
179
+
180
def batch_test(
    query,
    model,
    tok,
    gdc_genes_mutations,
    project_mappings,
    intent_model,
    intent_tok,
):
    """Produce the base LLM answer and the GDC helper output for one query.

    Args:
        query: the user question.
        model: model forwarded to ``generate_response``.
        tok: tokenizer forwarded to ``generate_response``.
        gdc_genes_mutations: forwarded to ``construct_and_execute_api_call``.
        project_mappings: forwarded to ``construct_and_execute_api_call``.
        intent_model: forwarded to ``construct_and_execute_api_call``.
        intent_tok: forwarded to ``construct_and_execute_api_call``.

    Returns:
        A ``pd.Series`` holding, in order: the base LLM output, the helper
        output, the cancer entities, the intent, the gene entities and the
        mutation entities. When the API step fails, or the helper output
        and cancer entities differ in length, both of those are empty
        lists.
    """
    modified_query = utilities.construct_modified_query_base_llm(query)
    print(f"modified_query is: {modified_query}")
    llama_base_output = generate_response(modified_query, model, tok)
    print(f"llama_base_output: {llama_base_output}")

    try:
        result = construct_and_execute_api_call(
            query, gdc_genes_mutations, project_mappings, intent_model, intent_tok
        )
    except Exception as e:
        # unable to compute at this time, recheck
        # ``result`` was never bound when the call raised, so build an empty
        # placeholder carrying every attribute read further down.
        print("unable to process query {} {}".format(query, str(e)))
        result = SimpleNamespace(
            helper_output=[],
            cancer_entities=[],
            intent=None,
            gene_entities=[],
            mutation_entities=[],
        )

    # if there is not a helper output for each unique cancer entity
    # log error to inspect and reprocess query later
    try:
        one_output_per_entity = len(result.helper_output) == len(
            result.cancer_entities
        )
    except TypeError:
        # One of the two values has no length at all; treat as a mismatch.
        one_output_per_entity = False
    if not one_output_per_entity:
        msg = (
            "there is not a unique helper output for each unique "
            "cancer entity in {}".format(query)
        )
        print("exception {}".format(msg))
        result.helper_output = []
        result.cancer_entities = []

    return pd.Series(
        [
            llama_base_output,
            result.helper_output,
            result.cancer_entities,
            result.intent,
            result.gene_entities,
            result.mutation_entities,
        ]
    )
224
+
225
+
226
def get_prefinal_response(row, model, tok):
    """Build the helper-augmented prompt for one row and generate a response.

    Args:
        row: mapping-like record providing ``"questions"`` and
            ``"helper_output"``.
        model: model forwarded to ``generate_response``.
        tok: tokenizer forwarded to ``generate_response``.

    Returns:
        A ``pd.Series`` of ``[modified_query, generated_response]``.

    Raises:
        KeyError: if ``row`` lacks ``"questions"`` or ``"helper_output"``.
    """
    try:
        query = row["questions"]
        helper_output = row["helper_output"]
    except KeyError as e:
        # Fail with the real cause. Logging the two locals here is not
        # possible because at least one of them is unbound at this point.
        raise KeyError(
            f"row is missing required field {e}; "
            "expected 'questions' and 'helper_output'"
        ) from e
    modified_query = utilities.construct_modified_query(query, helper_output)
    prefinal_llama_with_helper_output = generate_response(modified_query, model, tok)
    return pd.Series([modified_query, prefinal_llama_with_helper_output])
235
+
236
+
237
+
238
def execute_pipeline(question: str):
    """Answer one question end to end and return the final result table.

    Uses the module-level ``model``, ``tok``, ``gdc_genes_mutations``,
    ``project_mappings``, ``intent_model`` and ``intent_tok``.

    Steps: run ``batch_test`` on the question, keep rows that have one
    helper output per cancer entity, explode to one row per entity,
    generate the helper-augmented response, then post-process.

    Args:
        question: the user question.

    Returns:
        The transposed DataFrame of the columns named by
        ``utilities.get_final_columns()``, or an explanatory string when no
        usable helper output was retrieved for the question.
    """
    df = pd.DataFrame({'questions': [question]})
    print(f'Question received: {question}')
    print("starting pipeline")
    cuda_available = torch.cuda.is_available()
    print("CUDA available:", cuda_available)
    if cuda_available:
        # Only query the device name when a device is actually reported.
        print("CUDA device name:", torch.cuda.get_device_name(0))

    # queries input file
    print(f"running test on input {df}")
    df[
        [
            "llama_base_output",
            "helper_output",
            "cancer_entities",
            "intent",
            "gene_entities",
            "mutation_entities",
        ]
    ] = df["questions"].apply(
        lambda x: batch_test(
            x,
            model,
            tok,
            gdc_genes_mutations,
            project_mappings,
            intent_model,
            intent_tok,
        )
    )

    # retain responses with helper output
    df["len_helper"] = df["helper_output"].apply(len)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of ``df``.
    df_filtered = df[df["len_helper"] != 0].copy()
    df_filtered["len_ce"] = df_filtered["cancer_entities"].apply(len)
    # retain rows where one response is retrieved for each cancer entity
    df_filtered["ce_eq_helper"] = df_filtered["len_ce"] == df_filtered["len_helper"]
    df_filtered = df_filtered[df_filtered["ce_eq_helper"]]

    if df_filtered.empty:
        # Nothing to explode or post-process; the multi-column assignments
        # below are not expected to work on an empty frame, so stop here.
        msg = (
            "Unable to execute GDC API, can you please retry with a "
            "template question?"
        )
        print(msg)
        return msg

    df_filtered_exploded = df_filtered.explode(
        ["helper_output", "cancer_entities"], ignore_index=True
    )
    df_filtered_exploded[["modified_prompt", "pre_final_llama_with_helper_output"]] = (
        df_filtered_exploded.apply(
            lambda x: get_prefinal_response(x, model, tok), axis=1
        )
    )

    ### postprocess response
    print("postprocessing response")
    df_filtered_exploded[
        [
            "llama_base_stat",
            "delta_llama",
            "value_changed",
            "ground_truth_stat",
            "generated_stat_prefinal",
            "delta_prefinal",
            "generated_stat_final",
            "delta_final",
            "final_response",
        ]
    ] = df_filtered_exploded.apply(utilities.postprocess_response, axis=1)

    final_columns = utilities.get_final_columns()
    result = df_filtered_exploded[final_columns].T
    print('result {}'.format(result))
    print('completed')
    return result
307
 
308
+
309
+
310
def visible_component(input_text):
    """Return a Gradio update that sets a component's value to "WHATEVER".

    ``input_text`` is accepted but not used.
    """
    placeholder_update = gr.update(value="WHATEVER")
    return placeholder_update
312
 
 
337
  )
338
 
339
  search_button.click(
340
+ fn=execute_pipeline,
341
  inputs=[query_input],
342
  outputs=output,
343
  )
344
 
345
+
346
  if __name__ == "__main__":
347
  GDC_QAG_QUERY.launch(mcp_server=True, show_api=True)
old_app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import json
4
+ import gradio as gr
5
+ import pandas as pd
6
+ import spaces
7
+ import torch
8
+ from methods import gdc_api_calls, utilities
9
+ from gdc_pipeline import execute_pipeline, setup_args, setup_models_and_data
10
+ from transformers import AutoTokenizer, BertTokenizer, AutoModelForCausalLM, BertForSequenceClassification
11
+
12
+
13
# Access tokens come from the environment; an unset variable yields False.
working_llama_token = os.getenv("let_this_please_work", False)
hf_TOKEN = os.getenv("fineTest", False)
intent_token = os.getenv("query_intent_test", False)


# setup models and data

print("getting gdc project information")
project_mappings = gdc_api_calls.get_gdc_project_ids(start=0, stop=86)

# Intent classifier: tokenizer plus BERT sequence classifier, moved to the
# GPU and switched to eval mode.
print('loading intent model')
model_id = 'uc-ctds/query_intent'
intent_tok = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    token=intent_token,
)
intent_model = (
    BertForSequenceClassification.from_pretrained(model_id, token=intent_token)
    .to('cuda')
    .eval()
)


print("loading gdc genes and mutations")
gdc_genes_mutations = utilities.load_gdc_genes_mutations_hf(hf_TOKEN)

# Llama model: loaded in float16, moved to the GPU and switched to eval mode.
# Note that ``model_id`` is rebound here to the Llama checkpoint name.
print("loading llama-3B model")
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tok = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    token=working_llama_token,
)
model = (
    AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        token=working_llama_token,
    )
    .to('cuda')
    .eval()
)
51
+
52
+
53
+ # question = 'What is the co-occurence frequency of somatic homozygous deletions in CDKN2A and CDKN2B in the mesothelioma project TCGA-MESO in the genomic data commons?'
54
+
55
def wrapped_execute_pipeline(question: str):
    """Run the GDC pipeline for one question, never raising to the caller.

    The question is wrapped in a one-row DataFrame under the column
    ``"questions"`` and passed to ``execute_pipeline`` together with the
    module-level models and data.

    Args:
        question: the user question.

    Returns:
        Whatever ``execute_pipeline`` returns, or an explanatory string
        that includes the error text when anything raises.
    """
    questions_frame = pd.DataFrame({'questions': [question]})
    print(f'Question received: {question}')
    try:
        return execute_pipeline(
            questions_frame,
            gdc_genes_mutations,
            model,
            tok,
            intent_model,
            intent_tok,
            project_mappings,
            output_file_prefix=None,
        )
    except Exception as err:
        # Surface the failure as text so the UI shows a message instead of
        # an error.
        return f'Unable to execute GDC API, can you please retry with a template question? Error: {err}'
72
+
73
def visible_component(input_text):
    """Return a Gradio update that sets a component's value to "WHATEVER".

    ``input_text`` is accepted but not used.
    """
    placeholder_update = gr.update(value="WHATEVER")
    return placeholder_update
75
+
76
+
77
+ # Create Gradio interface
78
with gr.Blocks(title="GDC QAG MCP server") as GDC_QAG_QUERY:
    # Page heading.
    gr.Markdown(
        """
        # GDC QAG Service
        """
    )

    # Free-text question entry.
    with gr.Row():
        query_input = gr.Textbox(
            label="Search Query",
            info="Required: Enter your search query",
            placeholder='e.g. "What is the co-occurence frequency of somatic homozygous deletions in CDKN2A and CDKN2B in the mesothelioma project TCGA-MESO in the genomic data commons?"',
            lines=3,
        )

    search_button = gr.Button("Search", variant="primary")

    # Read-out for the pipeline result.
    output = gr.Textbox(
        label="Query Result",
        info="The Result of the Query will appear here",
        lines=10,
        max_lines=25,
    )

    # Clicking "Search" sends the query text through the pipeline wrapper
    # and writes its return value to the output box.
    search_button.click(
        fn=wrapped_execute_pipeline,
        inputs=[query_input],
        outputs=output,
    )

if __name__ == "__main__":
    GDC_QAG_QUERY.launch(mcp_server=True, show_api=True)