Fangyu Liu committed
Commit 8fc7477
1 Parent(s): 6663c9a

Update app.py

Files changed (1)
  1. app.py +100 -27
app.py CHANGED
@@ -46,32 +46,105 @@ Q: Which party has the second highest favor rates in 2007?
 A: Let's find the row of year 2007, that's Row 3. Let's extract the numbers on Row 3: [59.0, 38.0, 45.0]. 45.0 is the second highest. 45.0 is the number of Independents. The answer is Independents.
 {_INSTRUCTION}"""

-def text_generate(prompt, table, problem):
-  p = prompt + "\n" + table + "\n" + "Q: " + problem
-  # print(f"Final prompt is : {p}")
-  json_ = {"inputs": p,
-           "parameters":
-           {
-             "top_p": 0.9,
-             "temperature": 1.1,
-             "max_new_tokens": 128,
-             "return_full_text": True
-           }, "options":
-           {
-             "use_cache": True,
-             "wait_for_model": True
-           },}
-  response = requests.post(API_URL, headers=headers, json=json_)
-  print(f"Response is : {response}")
-  output = response.json()
-  print(f"output is : {output}")
-  output_tmp = output['generated_text']
-  print(f"output_tmp is: {output_tmp}")
-  #solution = output_tmp.split("\nQ:")[0] #output[0]['generated_text'].split("Q:")[0] # +"."
-  #print(f"Final response after splits is: {solution}")
-
-  #return solution
-  return output_tmp
+
+import torch
+from peft import PeftModel
+import transformers
+import gradio as gr
+
+assert (
+    "LlamaTokenizer" in transformers._import_structure["models.llama"]
+), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
+from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
+
+tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
+
+BASE_MODEL = "decapoda-research/llama-7b-hf"
+LORA_WEIGHTS = "tloen/alpaca-lora-7b"
+
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+
+try:
+    if torch.backends.mps.is_available():
+        device = "mps"
+except:
+    pass
+
+if device == "cuda":
+    model = LlamaForCausalLM.from_pretrained(
+        BASE_MODEL,
+        load_in_8bit=False,
+        torch_dtype=torch.float16,
+        device_map="auto",
+    )
+    model = PeftModel.from_pretrained(
+        model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
+    )
+elif device == "mps":
+    model = LlamaForCausalLM.from_pretrained(
+        BASE_MODEL,
+        device_map={"": device},
+        torch_dtype=torch.float16,
+    )
+    model = PeftModel.from_pretrained(
+        model,
+        LORA_WEIGHTS,
+        device_map={"": device},
+        torch_dtype=torch.float16,
+    )
+else:
+    model = LlamaForCausalLM.from_pretrained(
+        BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
+    )
+    model = PeftModel.from_pretrained(
+        model,
+        LORA_WEIGHTS,
+        device_map={"": device},
+    )
+
+
+if device != "cpu":
+    model.half()
+model.eval()
+if torch.__version__ >= "2":
+    model = torch.compile(model)
+
+
+def evaluate(
+    table,
+    question,
+    input=None,
+    temperature=0.1,
+    top_p=0.75,
+    top_k=40,
+    num_beams=4,
+    max_new_tokens=128,
+    **kwargs,
+):
+    prompt = _TEMPLATE + "\n" + table + "\n" + "Q: " + question
+    inputs = tokenizer(prompt, return_tensors="pt")
+    input_ids = inputs["input_ids"].to(device)
+    generation_config = GenerationConfig(
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        num_beams=num_beams,
+        **kwargs,
+    )
+    with torch.no_grad():
+        generation_output = model.generate(
+            input_ids=input_ids,
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+            output_scores=True,
+            max_new_tokens=max_new_tokens,
+        )
+    s = generation_output.sequences[0]
+    output = tokenizer.decode(s)
+    return output.split("### Response:")[1].strip()

@@ -86,7 +159,7 @@ def process_document(image, question):
   table = processor_deplot.decode(predictions[0], skip_special_tokens=True)

   # send prompt+table to LLM
-  res = text_generate(_TEMPLATE, table, question)
+  res = evaluate(table, question)
   print (res)

 description = "Demo for deplot+llm for QA or summarisation. To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
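
For reference, a minimal sketch of how the new evaluate() function plugs into the rest of the demo. Only evaluate(), device, and description come from this file; the DePlot checkpoint name ("google/deplot"), the Pix2Struct classes, the table-extraction prompt string, max_new_tokens=512, and the Gradio wiring below are assumptions about the surrounding app.py, not code from this commit.

# Hypothetical wiring around evaluate(); the model names and Gradio interface
# here are assumptions, not part of the diff above.
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import gradio as gr

# DePlot chart-to-table model (assumed checkpoint).
model_deplot = Pix2StructForConditionalGeneration.from_pretrained("google/deplot").to(device)
processor_deplot = Pix2StructProcessor.from_pretrained("google/deplot")

def process_document(image, question):
    # Step 1: DePlot renders the chart image as a linearised text table.
    inputs = processor_deplot(
        images=image,
        text="Generate underlying data table of the figure below:",
        return_tensors="pt",
    ).to(device)
    predictions = model_deplot.generate(**inputs, max_new_tokens=512)
    table = processor_deplot.decode(predictions[0], skip_special_tokens=True)

    # Step 2: send prompt + table to the Alpaca-LoRA model defined above.
    res = evaluate(table, question)
    return res

demo = gr.Interface(
    fn=process_document,
    inputs=["image", "text"],
    outputs="text",
    title="DePlot+LLM",
    description=description,
)
demo.launch()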