nttwt1597 committed on
Commit a8be9e1 · verified · 1 Parent(s): bd5d11b

RAG + feedback update

Files changed (1)
  1. app.py +161 -124
app.py CHANGED
@@ -1,86 +1,182 @@
  import os
- token=os.environ['token']
- # token_r=os.environ['token_r']
- # token_w=os.environ['token_w']

  import torch
  import gradio as gr
- from unsloth import FastLanguageModel
- from peft import PeftConfig, PeftModel, get_peft_model
- from transformers import pipeline, TextIteratorStreamer
- from threading import Thread
-
- # For getting tokenizer()
- model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
- peft_model_adapter_id = "nttwt1597/test_v2_cancer_v4_checkpoint2900"
-
- model, tokenizer = FastLanguageModel.from_pretrained(
-     model_name = model_id,
-     max_seq_length = 4096,
-     dtype = None,
-     load_in_4bit = True,
  )
- model.load_adapter(peft_model_adapter_id, token=token)

- terminators = [
      tokenizer.eos_token_id,
-     tokenizer.convert_tokens_to_ids("<|eot_id|>")
  ]

- FastLanguageModel.for_inference(model)

- criteria_prompt = """Based on the provided instructions and clinical trial information, generate the eligibility criteria for the study.
-
- ### Instruction:
- As a clinical researcher, generate comprehensive eligibility criteria to be used in clinical research based on the given clinical trial information. Ensure the criteria are clear, specific, and suitable for a clinical research setting.
-
- ### Clinical trial information:
- {}
-
- ### Eligibility criteria:
- {}"""

- def format_prompt(text):
-     return criteria_prompt.format(text, "")

- def run_model_on_text(text):
-     prompt = format_prompt(text)
-     inputs = tokenizer(prompt, return_tensors='pt')
-
-     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-     generation_kwargs = dict(inputs, streamer=streamer, eos_token_id=terminators, max_new_tokens=1024, repetition_penalty=1.175)
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-
-     generated_text = ""
-     for new_text in streamer:
-         generated_text += new_text
-         yield generated_text

  place_holder = f"""Study Objectives
- The purpose of this study is to evaluate the safety, tolerance and efficacy of Liposomal Paclitaxel With Nedaplatin as First-line in patients with Advanced or Recurrent Esophageal Carcinoma

- Conditions: Esophageal Carcinoma

- Intervention / Treatment:
- DRUG: Liposomal Paclitaxel,
- DRUG: Nedaplatin

- Location: China

- Study Design and Phases
- Study Type: INTERVENTIONAL
- Phase: PHASE2 Primary Purpose:
- TREATMENT Allocation: NA
  Interventional Model: SINGLE_GROUP Masking: NONE
  """

  prefilled_value = """Study Objectives
- [Brief Summary] and/or [Detailed Description]

  Conditions: [Disease]

- Intervention / Treatment
  [DRUGs]

  Location
@@ -90,92 +186,33 @@ Study Design and Phases
  Study Type:
  Phase:
  Primary Purpose:
- Allocation:
  Interventional Model:
  Masking:"""

-
- # hf_writer = gr.HuggingFaceDatasetSaver("ravistech/criteria-feedback-demo", token, private=True)
- # with gr.Blocks() as demo:
- #     with gr.Row():
- #         with gr.Column():
- #             prompt_box = gr.Textbox(
- #                 label="Research Information",
- #                 placeholder=place_holder,
- #                 value=prefilled_value,
- #                 lines=10)
- #             submit_button = gr.Button("Generate")
- #         with gr.Column():
- #             output_box = gr.Textbox(
- #                 label="Eligiblecriteria Criteria",
- #                 lines=21,
- #                 interactive=False)
- #     with gr.Row():
- #         with gr.Column():
- #             feedback_box = gr.Textbox(label="Enter your feedback here...", lines=3, interactive=True)
- #             feedback_button = gr.Button("Submit Feedback")
- #             status_text = gr.Textbox(label="Status", lines=1, interactive=False)
-
- #     submit_button.click(
- #         run_model_on_text,
- #         inputs=prompt_box,
- #         outputs=output_box
- #     )
-
- #     def submit_feedback(prompt, generated_text, feedback):
- #         data = {
- #             "prompt": prompt,
- #             "generated_text": generated_text,
- #             "feedback": feedback
- #         }
- #         hf_writer.flag(data)
- #         return "Feedback submitted."
-
- #     feedback_button.click(
- #         submit_feedback,
- #         inputs=[prompt_box, output_box, feedback_box],
- #         outputs=status_text
- #     )
-
- #     feedback_button.click(
- #         hf_writer.flag([prompt_box, output_box, feedback_box]),
- #         # lambda *args: hf_writer.flag(args),
- #         inputs=[prompt_box, output_box, feedback_box],
- #         outputs=status_text,
- #     )
-
- #     gr.Interface(lambda x: x, "text", "text", allow_flagging="manual", flagging_callback=hf_writer)
-
- #     feedback_button.click(
- #         save_feedback,
- #         inputs=[prompt_box, output_box, feedback_box],
- #         outputs=status_text
- #     )
-
- # demo.launch()
-
- #----------------------------------
  prompt_box = gr.Textbox(
-     lines=25,
      label="Research Information",
-     placeholder=place_holder,
      value=prefilled_value,
  )

  output_box = gr.Textbox(
-     lines=25,
      label="Eligibility Criteria",
  )

  demo = gr.Interface(
-     fn=run_model_on_text,
      inputs=prompt_box,
      outputs=output_box,
-     # allow_flagging="manual",
-     # flagging_options=["incorrect", "inappropriate", "appropriate"],
-     # flagging_callback=hf_writer
  )

  demo.queue(max_size=20).launch(debug=True, share=True)
 
  import os
+ token_r = os.environ['token_r']
+ token_w = os.environ['token_w']
+ token_w_feedback = os.environ['token_w_feedback']
+
+ from llama_index.core import Settings
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+ # Dependencies must be installed ahead of time (e.g. via requirements.txt):
+ #   pip install -qU llama-index-vector-stores-elasticsearch llama-index-embeddings-huggingface llama-index
+ # (a bare `!pip install` line is notebook syntax and is a SyntaxError in a plain .py file)
+ from llama_index.vector_stores.elasticsearch import ElasticsearchStore
+
+ from llama_index.core.query_engine import CitationQueryEngine
+ from llama_index.core import VectorStoreIndex
+
  import torch
+ from transformers import AutoTokenizer
+ from llama_index.llms.huggingface import HuggingFaceLLM
+ from transformers import BitsAndBytesConfig
+
  import gradio as gr
+
+ model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     model_name,
+     token=token_r,
  )

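+ # Llama-3-Instruct ends an assistant turn with <|eot_id|> rather than only the eos
+ # token, so both ids are treated as stop tokens; otherwise generation can run past the answer.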
+ stopping_ids = [
      tokenizer.eos_token_id,
+     tokenizer.convert_tokens_to_ids("<|eot_id|>"),
  ]

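+ # 4-bit NF4 quantization with double quantization shrinks the 8B model's weights to a
+ # fraction of their float16 size so it fits on a single GPU; compute still runs in float16.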
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_use_double_quant=True,
+ )

+ # Get the model
+ llm = HuggingFaceLLM(
+     model_name="meta-llama/Meta-Llama-3-8B-Instruct",
+     tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
+     model_kwargs={
+         "token": token_r,
+         "quantization_config": quantization_config,
+     },
+     context_window=8191,
+     max_new_tokens=2048,
+     generate_kwargs={
+         # "do_sample": True,
+         # "temperature": 0.1,
+         # "top_p": 0.9,
+         "repetition_penalty": 1,
+     },
+     stopping_ids=stopping_ids,
+ )

+ # bge embedding model
+ embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
+ Settings.embed_model = embed_model

+ # Llama-3-8B-Instruct model
+ Settings.llm = llm

+ # Get data from Elasticsearch
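+ # NOTE: es_cloud_id and es_password are not defined anywhere in this diff; they are
+ # presumably read from Space secrets via os.environ, like the Hugging Face tokens above.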
+ es_vector_store = ElasticsearchStore(
+     index_name="train_criteria_index",
+     es_cloud_id=es_cloud_id,
+     es_user="elastic",
+     es_password=es_password,
+ )

+ index_es = VectorStoreIndex.from_vector_store(es_vector_store)

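+ # CitationQueryEngine retrieves the most similar indexed trials (top 10) and keeps track
+ # of the source chunks behind its answer, so retrieved studies can later be cited by NCT ID.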
+ query_engine_get_study = CitationQueryEngine.from_args(
+     index_es,
+     similarity_top_k=10,
+     citation_chunk_size=2048,
+     verbose=True,
+ )
+
+ def get_prompt(text):
+     studies_response = query_engine_get_study.query(f"""
+     Based on the provided instructions and clinical trial information, what are the eligibility criteria for the given clinical trial?
+     Ensure the studies are relevant and have similar study information. Prioritize the following topics when finding related studies:
+     1. Conditions
+     2. Intervention/Treatment
+     3. Study Objectives
+     4. Study Design and Phases
+
+     ### Clinical Trial Information:
+     {text}
+     """)
+
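+     # Collect the retrieved chunk texts and their metadata (study identifiers) so the
+     # generation prompt below can ground the new criteria in similar trials.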
+     study_ref = []
+     metadata_list = []
+     for source in studies_response.source_nodes:
+         ref = source.node.get_text()
+         study_ref.append(ref)
+         meta_data = source.node.get_metadata_str()
+         metadata_list.append(meta_data)
+
+     criteria_response = llm.stream_complete(f"""
+     Based on the provided instructions and clinical trial information, generate the eligibility criteria for the study.
+
+     ## Instruction:
+     You are a clinical researcher able to generate new comprehensive eligibility criteria for clinical research based on the given clinical trial information.
+     Analyze the clinical trial information, delimited by ### Clinical Trial Information, together with the information from the retrieved papers, delimited by ### Related data. Choose the suitable criteria and adapt them to the given clinical trial information so that the new eligibility criteria are more precise.
+     Also report the NCT IDs and study names of the papers used, delimited by ### Reference Papers.
+     The pattern of the output is delimited by ### Pattern of the output.
+     Ensure the criteria are clear, specific, and suitable for a clinical research setting.
+
+     Prioritize the following topics from the clinical trial information:
+     1. Conditions
+     2. Intervention/Treatment
+     3. Study Objectives
+     4. Study Design and Phases
+
+     ### Clinical Trial Information
+     {text}
+
+     ### Related data
+     {study_ref}
+
+     ### Reference Papers
+     {metadata_list}
+
+     ### Pattern of the output
+     Inclusion Criteria
+     1.
+     2.
+
+     Exclusion Criteria
+     1.
+     2.
+
+     Reference Papers
+     1. NCT ID:
+        Study Name:
+     2. NCT ID:
+        Study Name:
+     3. NCT ID:
+        Study Name:
+     """)
+
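+     # stream_complete returns an iterator of partial completions; yielding each chunk
+     # lets Gradio render the generated criteria incrementally in the output box.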
+     for chunk in criteria_response:
+         yield chunk

  place_holder = f"""Study Objectives
+ The purpose of this study is to evaluate the safety, tolerance and efficacy of Liposomal Paclitaxel With Nedaplatin as First-line in patients with Advanced or Recurrent Esophageal Carcinoma

+ Conditions: Esophageal Carcinoma

+ Intervention / Treatment:
+ DRUG: Liposomal Paclitaxel,
+ DRUG: Nedaplatin

+ Location: China

+ Study Design and Phases
+ Study Type: INTERVENTIONAL
+ Phase: PHASE2 Primary Purpose:
+ TREATMENT Allocation: NA
  Interventional Model: SINGLE_GROUP Masking: NONE
  """

  prefilled_value = """Study Objectives
+ [Brief Summary and/or Detailed Description]

  Conditions: [Disease]

+ Intervention / Treatment
  [DRUGs]

  Location

  Study Type:
  Phase:
  Primary Purpose:
+ Allocation:
  Interventional Model:
  Masking:"""
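+ # HuggingFaceDatasetSaver appends flagged (input, output, flag) rows to a private
+ # dataset repo on the Hub, authenticated with the write token above.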
+ hf_writer = gr.HuggingFaceDatasetSaver(hf_token=token_w_feedback, dataset_name="nttwt1597/criteria-feedback-demo-1", private=True)
+

  prompt_box = gr.Textbox(
+     lines=10,
      label="Research Information",
+     # placeholder=place_holder,
      value=prefilled_value,
  )

  output_box = gr.Textbox(
+     lines=10,
      label="Eligibility Criteria",
  )

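+ # With allow_flagging="manual", each click on a flag option sends the current
+ # prompt, the generated criteria, and the chosen label to hf_writer.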
  demo = gr.Interface(
+     fn=get_prompt,
      inputs=prompt_box,
      outputs=output_box,
+     # allow_flagging='auto',
+     allow_flagging="manual",
+     flagging_options=["appropriate", "inappropriate", "incorrect"],
+     flagging_callback=hf_writer,
+     # live=True
  )

  demo.queue(max_size=20).launch(debug=True, share=True)