File size: 8,129 Bytes
426b9bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import streamlit as st
import torch
import os
import sys
import time
import json
from typing import List
import datasets
import csv
from transformers import LlamaTokenizer, LlamaForCausalLM
import tqdm
# Identifier of the base model; handed to `load_model` / `from_pretrained`.
base_model_name = "EthioNLP/Amharic-llama-base-model"
# Identifier of the adapter; the tokenizer is loaded from this same name.
adapters_name = "EthioNLP/Amharic-LLAMA-all-data"
# Prompt template. `{instruction}` is its only placeholder and is filled in
# with `str.format`; the template ends right after the assistant tag so the
# model's continuation is the answer.
BASE_PROMPT = (
    "Below is an interaction between a human and an AI fluent in English and Amharic, "
    "providing reliable and informative answers. The AI is supposed to answer test "
    "questions from the human with short responses saying just the answer and nothing else.\n"
    "Human: {instruction}\n"
    "Assistant [Amharic] : "
)
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaConfig
# Function to load the main model for text generation
def load_model(model_name, quantization, device_map='cuda:0'):
    """Load a LLaMA causal language model via ``from_pretrained``.

    Args:
        model_name: Model identifier or path handed to
            ``LlamaForCausalLM.from_pretrained``.
        quantization: Forwarded as ``load_in_8bit``; a truthy value requests
            8-bit loading.
        device_map: Device placement forwarded to ``from_pretrained``.
            Defaults to ``'cuda:0'``, the value that used to be hard-coded,
            so existing two-argument callers behave exactly as before.
            Pass another placement (for example ``'auto'``) to run on
            different hardware.

    Returns:
        The loaded ``LlamaForCausalLM`` instance.
    """
    return LlamaForCausalLM.from_pretrained(
        model_name,
        return_dict=True,
        load_in_8bit=quantization,
        device_map=device_map,
        low_cpu_mem_usage=True,
    )
# Function to wrap an already-loaded model with a PEFT adapter
def load_peft_model(model, peft_model):
    """Attach the PEFT weights identified by ``peft_model`` to ``model``.

    Args:
        model: The base model to wrap.
        peft_model: Adapter identifier or path handed to
            ``PeftModel.from_pretrained``.

    Returns:
        Whatever ``PeftModel.from_pretrained`` returns for the pair; the
        current directory (``'./'``) is passed as the offload folder.
    """
    return PeftModel.from_pretrained(model, peft_model, offload_folder='./')
# Build the model from its config only, so FSDP checkpoints can be loaded into it
def load_llama_from_config(config_path):
    """Instantiate ``LlamaForCausalLM`` from a configuration alone.

    Args:
        config_path: Location handed to ``LlamaConfig.from_pretrained``.

    Returns:
        A ``LlamaForCausalLM`` constructed from that configuration. No
        checkpoint weights are loaded here; the caller is expected to load
        them afterwards.
    """
    return LlamaForCausalLM(config=LlamaConfig.from_pretrained(config_path))
def main(
    model,
    tokenizer,
    datasource,  # List of data sources to use, no default value
    csv_file_path,  # Path to the CSV file to save responses, no default value
    max_new_tokens=100,  # The maximum numbers of tokens to generate
    seed=42,  # seed value for reproducibility
    do_sample=True,  # Whether or not to use sampling; use greedy decoding otherwise.
    min_length=None,  # The minimum length of the sequence to be generated
    use_cache=True,  # [optional] Whether or not the model should use the past last key/values attentions
    top_p=1.0,  # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    temperature=1.0,  # [optional] The value used to modulate the next token probabilities.
    top_k=5,  # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
    repetition_penalty=5.0,  # The parameter for repetition penalty. 1.0 means no penalty.
    length_penalty=1,  # [optional] Exponential penalty to the length used with beam-based generation.
    enable_azure_content_safety=False,  # Enable safety check with Azure content safety API
    enable_sensitive_topics=False,  # Enable check for sensitive topics using AuditNLG APIs
    enable_saleforce_content_safety=False,  # Enable safety check with Salesforce safety T5
    hf_dataset=None,  # Dataset mapping with a 'test' split; see docstring
    **kwargs  # Additional arguments for the model.generate function
):
    """Generate a response for every row of a test split and save them as CSV.

    For each row the ``instruction`` and ``input`` fields are merged into
    ``BASE_PROMPT``, the prompt is tokenized and moved to ``model.device``,
    and ``model.generate`` is called with the sampling options below. One CSV
    row is written per item with the columns ``Instruction``, ``Input Text``,
    ``Datasource``, ``response`` and ``gold_label``.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        datasource: Accepted for interface compatibility; this function does
            not read it and does not filter rows by it.
            NOTE(review): the inline comment suggests it was meant to select
            data sources - confirm the intended behaviour.
        csv_file_path: Destination file; it is overwritten.
        max_new_tokens, do_sample, min_length, use_cache, top_p, temperature,
        top_k, repetition_penalty, length_penalty: Forwarded unchanged to
            ``model.generate``.
        seed: Passed to ``torch.manual_seed`` before generation so sampled
            outputs are repeatable. ``None`` skips seeding.
        enable_azure_content_safety, enable_sensitive_topics,
        enable_saleforce_content_safety: Accepted but not used here.
        hf_dataset: Mapping whose ``'test'`` entry is iterated. Each row must
            provide ``instruction``, ``input``, ``datasource`` and ``output``.
            When omitted, a module-level ``hf_dataset`` is used if one exists
            (this is what the body previously relied on).
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Raises:
        ValueError: If no dataset was passed and no module-level
            ``hf_dataset`` is defined.
    """
    # Note: Ensure that the appropriate tokenizer is used for the language.
    print("*** Ensure that you have replaced the default tokenizer with the appropriate one for your use case.")

    if hf_dataset is None:
        # Backward compatibility: fall back to the global the body used to read.
        hf_dataset = globals().get("hf_dataset")
    if hf_dataset is None:
        raise ValueError(
            "No dataset available: pass hf_dataset=... (a mapping with a "
            "'test' split) or define a module-level `hf_dataset`."
        )

    # Make sampling repeatable; previously `seed` was accepted but ignored.
    if seed is not None:
        torch.manual_seed(seed)

    model.eval()

    dataset = hf_dataset['test']  # Change to the desired split if necessary

    # Prepare the CSV file for saving responses
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Instruction', 'Input Text', 'Datasource', 'response', 'gold_label'])  # Column headers

        for item in tqdm.tqdm(dataset):
            instruction = item['instruction']
            input_text = item['input']
            # Kept separate from the `datasource` parameter so the argument
            # is not clobbered by per-row values.
            item_datasource = item['datasource']
            gold_label = item['output']

            # Combine instruction and input_text for the prompt
            user_prompt = BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}")
            batch = tokenizer(user_prompt, return_tensors="pt")
            # Ensure tensors are on the same device as the model
            batch = {k: v.to(model.device) for k, v in batch.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **batch,
                    max_new_tokens=max_new_tokens,
                    do_sample=do_sample,
                    top_p=top_p,
                    temperature=temperature,
                    min_length=min_length,
                    use_cache=use_cache,
                    top_k=top_k,
                    repetition_penalty=repetition_penalty,
                    length_penalty=length_penalty,
                    **kwargs,
                )

            # Drop the prompt by token count rather than by character count:
            # the decoded prompt is not guaranteed to match `user_prompt`
            # character for character, which would shift a string slice.
            prompt_length = batch["input_ids"].shape[1]
            output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

            # Write the instruction, input text, and output to the CSV file
            writer.writerow([instruction, input_text, item_datasource, output_text, gold_label])
            # Release cached GPU memory between rows.
            torch.cuda.empty_cache()
# Module-level setup: load the model and tokenizer that `predict` uses.
# NOTE(review): Streamlit re-executes the script on user interaction, so this
# loading may run repeatedly - consider caching it (e.g. st.cache_resource).
model = load_model(base_model_name, quantization=True)
tokenizer = LlamaTokenizer.from_pretrained(adapters_name)

# Keep the embedding table in step with the tokenizer's vocabulary size.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) != embedding_size:
    print("resize the embedding size by the size of the tokenizer")
    model.resize_token_embeddings(len(tokenizer))

# Load adapter model
model.load_adapter(adapters_name)

# A stray `BASE_PROMPT.format(instruction=..., ...)` expression used to sit
# here. It referenced `instruction` and `input_text`, which are undefined at
# module level, so the script raised NameError before the UI was built; its
# result was discarded anyway, so it has been removed.

# Generation defaults read as globals by `predict`.
max_new_tokens = 100  # The maximum numbers of tokens to generate
seed = 42  # seed value for reproducibility (not read by `predict`)
do_sample = True  # Whether or not to use sampling; use greedy decoding otherwise.
min_length = None  # The minimum length of the sequence to be generated
use_cache = True  # [optional] Whether or not the model should use the past last key/values attentions
top_p = 1.0  # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature = 1.0  # [optional] The value used to modulate the next token probabilities.
top_k = 5  # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
repetition_penalty = 5.0  # The parameter for repetition penalty. 1.0 means no penalty.
length_penalty = 1  # [optional] Exponential penalty to the length used with beam-based generation.
# The three safety switches below are defined but not read by `predict`.
enable_azure_content_safety = False  # Enable safety check with Azure content safety API
enable_sensitive_topics = False  # Enable check for sensitive topics using AuditNLG APIs
enable_saleforce_content_safety = False  # Enable safety check with Salesforce safety T5
def predict(instruction, input_text=" "):
    """Generate the model's answer to a single instruction.

    The instruction and optional input text are merged into ``BASE_PROMPT``,
    tokenized, moved to ``model.device`` and passed to ``model.generate``
    using the module-level generation settings (``max_new_tokens``,
    ``do_sample``, ``top_p``, ``temperature``, ``min_length``, ``use_cache``,
    ``top_k``, ``repetition_penalty``, ``length_penalty``).

    Args:
        instruction: The question or instruction text.
        input_text: Extra context appended on a new line after the
            instruction. Defaults to a single space.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens skipped.
    """
    user_prompt = BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}")
    batch = tokenizer(user_prompt, return_tensors="pt")
    # Ensure tensors are on the same device as the model
    batch = {k: v.to(model.device) for k, v in batch.items()}

    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            top_p=top_p,
            temperature=temperature,
            min_length=min_length,
            use_cache=use_cache,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
        )

    # Strip the prompt by token count instead of slicing the decoded string
    # by len(user_prompt): the decoded prompt is not guaranteed to match the
    # original text character for character, which would shift the cut.
    prompt_length = batch["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
# Streamlit front end: a single question box and an area showing the answer.
st.title('LLM Interaction Interface')
user_input = st.text_input("Ask a question:")
if user_input:
    # Hand the question to `predict` and display whatever it returns.
    response = predict(user_input)
    st.text_area("Response:", value=response, height=300)
|