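# Evaluate a LoRA fine-tuned Llama-3-8B-Instruct model on the SNLI test split:
# prompt the model for an NLI label (ENTAILMENT / CONTRADICTION / NEUTRAL) for
# each premise-hypothesis pair and dump the predictions to a JSON file.
# All model, adapter, and data paths below are local to the author's machine.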
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from tqdm import tqdm
def generate_response(model, tokenizer, setup_prompt):
    # Tokenize the prompt and move the input tensors to the model's device.
    model_inputs = tokenizer(setup_prompt, return_tensors="pt").to(model.device)
    output = model.generate(**model_inputs, max_length=1024,
                            pad_token_id=tokenizer.eos_token_id,
                            eos_token_id=tokenizer.eos_token_id)
    question_to_claims = tokenizer.decode(output[0], skip_special_tokens=True)
    # Drop the prompt (approximated by its whitespace word count) and return
    # only the first word of the continuation, i.e. the predicted label.
    prompt_tokens = len(setup_prompt.split())
    return question_to_claims.split()[prompt_tokens]
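# Build a zero-shot NLI prompt in the Llama-3 chat format: system instruction,
# then CONTEXT and HYPOTHESIS as user turns, ending with the assistant header.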
def naive_prompt(context, hypothesis):
    prompt = f'''
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant to do Natural Language Inference.\nYou are given a CONTEXT and HYPOTHESIS and you will predict a label ONLY from the set ENTAILMENT,CONTRADICTION,NEUTRAL.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
CONTEXT: {context}
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
HYPOTHESIS: {hypothesis}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>#Answer:
'''
    return prompt
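# Few-shot variant of the prompt above: prepends two worked SNLI examples
# (both labelled "neutral") before the actual CONTEXT/HYPOTHESIS pair.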
def naive_prompt_more_hc_prime(context, hypothesis):
    prompt = f'''
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant to do Natural Language Inference.\nYou are given a CONTEXT and HYPOTHESIS and you will predict a label ONLY from the set ENTAILMENT,CONTRADICTION,NEUTRAL.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
CONTEXT: The girl is wearing shoes
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
HYPOTHESIS: A girl asleep on a hard wood floor cuddling her baby doll
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>#Answer:
"neutral"
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
CONTEXT: The girl is watching TV
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
HYPOTHESIS: A girl sleeping on the floor with her dolls
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>#Answer:
"neutral"
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
CONTEXT: {context}
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
HYPOTHESIS: {hypothesis}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>#Answer:
'''
    return prompt
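# Load an SNLI .jsonl split into a list of {'sentence1', 'sentence2', 'label'} dicts.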
def read_data(file_path):
    file_content = []
    with open(file_path, 'r') as file:
        for line in file:
            line = json.loads(line)
            sentence1 = line['sentence1']
            sentence2 = line['sentence2']
            gold_label = line['gold_label']
            json_object = {
                'sentence1': sentence1,
                'sentence2': sentence2,
                'label': gold_label
            }
            file_content.append(json_object)
    return file_content
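# Main driver: load the base model, merge the SNLI LoRA adapter, then predict
# a label for every SNLI test pair and dump the predictions to a JSON file.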
def llama3_snli():
    #device_map = "mps"
    device_map = "auto"
    # Load the base Llama-3-8B-Instruct model from a local checkpoint.
    model = AutoModelForCausalLM.from_pretrained(
        "/Users/sbhar/Riju/PhDCode/RAG_LLama/nebula-rag-code/llama3-8B-Instruct-hf",
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=device_map,
    )
    print("BaseLine Model Loaded !!")
    print("-------------------------------------")
    # Attach the SNLI LoRA adapter and merge it into the base weights.
    model = PeftModel.from_pretrained(model, "/Users/sbhar/Riju/PhDCode/SNLI-FT/model/snli-adapter", device_map=device_map)
    model = model.merge_and_unload()
    tokenizer = AutoTokenizer.from_pretrained("/Users/sbhar/Riju/PhDCode/RAG_LLama/nebula-rag-code/llama3-8B-Instruct-hf", use_fast=True, trust_remote_code=True)
    tokenizer.pad_token_id = 18610  # use a fixed token id as the pad token
    tokenizer.padding_side = "right"
    print("Fine tuned Model and tokenizer Loaded Locally !!")
    file_content = read_data('/Users/sbhar/Riju/PhDCode/SNLI-FT/data/snli_1.0/snli_1.0_test.jsonl')
    pred_label_file = '/Users/sbhar/Riju/PhDCode/SNLI-FT/output/pred_outputs.json'
    pred_outputs = {}
    #print(file_content[0])
    # Predict a label for every test pair, keyed by example index.
    for i, item in enumerate(tqdm(file_content, desc="Predicting Labels")):
        context = item['sentence1']
        hypothesis = item['sentence2']
        prompt = naive_prompt(context=context, hypothesis=hypothesis)
        pred_label = generate_response(model=model, tokenizer=tokenizer, setup_prompt=prompt)
        pred_outputs[i] = pred_label
    # Write all predictions to disk once the loop has finished.
    with open(pred_label_file, 'w') as file:
        json.dump(pred_outputs, file)
    print("All Predictions Dumped !!")
"""
context = file_content[500]['sentence1']
hypothesis = file_content[500]['sentence2']
print(context)
print(hypothesis)
print(file_content[500]['label'])
prompt = naive_prompt(context,hypothesis)
answer = generate_response(model=model,tokenizer=tokenizer,setup_prompt=prompt)
print(answer)
"""
if __name__ == "__main__":
    llama3_snli()