Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
import time
|
6 |
+
import json
|
7 |
+
from typing import List
|
8 |
+
import datasets
|
9 |
+
import csv
|
10 |
+
from transformers import LlamaTokenizer, LlamaForCausalLM
|
11 |
+
import tqdm
|
12 |
+
|
13 |
+
|
14 |
+
base_model_name = "EthioNLP/Amharic-llama-base-model"
|
15 |
+
adapters_name = 'EthioNLP/Amharic-LLAMA-all-data'
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
BASE_PROMPT = """Below is an interaction between a human and an AI fluent in English and Amharic, providing reliable and informative answers. The AI is supposed to answer test questions from the human with short responses saying just the answer and nothing else.
|
21 |
+
|
22 |
+
Human: {instruction}
|
23 |
+
|
24 |
+
Assistant [Amharic] : """
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
from peft import PeftModel
|
29 |
+
from transformers import LlamaForCausalLM, LlamaConfig
|
30 |
+
|
31 |
+
# Function to load the main model for text generation
|
32 |
+
def load_model(model_name, quantization):
    """Load a Llama causal-LM checkpoint for text generation.

    Args:
        model_name: Model identifier or path handed to
            ``LlamaForCausalLM.from_pretrained``.
        quantization: Forwarded unchanged as the ``load_in_8bit`` option.

    Returns:
        The model object returned by ``LlamaForCausalLM.from_pretrained``.
    """
    # The device map is fixed to the first CUDA device.
    load_options = {
        "return_dict": True,
        "load_in_8bit": quantization,
        "device_map": 'cuda:0',
        "low_cpu_mem_usage": True,
    }
    return LlamaForCausalLM.from_pretrained(model_name, **load_options)
|
41 |
+
|
42 |
+
|
43 |
+
# Function to load the PeftModel for performance optimization
|
44 |
+
def load_peft_model(model, peft_model):
    """Wrap ``model`` with the PEFT adapter identified by ``peft_model``.

    Args:
        model: The already-loaded base model.
        peft_model: Adapter identifier or path passed to
            ``PeftModel.from_pretrained``.

    Returns:
        The object returned by ``PeftModel.from_pretrained``.
    """
    # The current working directory is used as the offload folder.
    return PeftModel.from_pretrained(model, peft_model, offload_folder='./')
|
47 |
+
|
48 |
+
# Loading the model from config to load FSDP checkpoints into that
|
49 |
+
def load_llama_from_config(config_path):
    """Instantiate a Llama causal-LM from a configuration only.

    The model is constructed directly from the config object; per the
    original note it is meant as a shell to load FSDP checkpoints into.

    Args:
        config_path: Location passed to ``LlamaConfig.from_pretrained``.

    Returns:
        A ``LlamaForCausalLM`` built from that configuration.
    """
    return LlamaForCausalLM(config=LlamaConfig.from_pretrained(config_path))
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
def main(
    model,
    tokenizer,
    datasource,  # List of data sources to use, no default value
    csv_file_path,  # Path to the CSV file to save responses, no default value
    max_new_tokens=100,  # The maximum numbers of tokens to generate
    seed=42,  # seed value for reproducibility
    do_sample=True,  # Whether or not to use sampling; use greedy decoding otherwise.
    min_length=None,  # The minimum length of the sequence to be generated
    use_cache=True,  # [optional] Whether or not the model should use the past last key/values attentions
    top_p=1.0,  # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    temperature=1.0,  # [optional] The value used to modulate the next token probabilities.
    top_k=5,  # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
    repetition_penalty=5.0,  # The parameter for repetition penalty. 1.0 means no penalty.
    length_penalty=1,  # [optional] Exponential penalty to the length used with beam-based generation.
    enable_azure_content_safety=False,  # Enable safety check with Azure content safety API
    enable_sensitive_topics=False,  # Enable check for sensitive topics using AuditNLG APIs
    enable_saleforce_content_safety=False,  # Enable safety check with Salesforce safety T5
    hf_dataset=None,  # Mapping of split name -> iterable of records; its 'test' split is evaluated
    **kwargs  # Additional arguments for the model.generate function
):
    """Run the model over a dataset's ``test`` split and save responses to CSV.

    For every record a prompt is built from ``BASE_PROMPT``, a completion is
    generated, and one CSV row is written with the columns
    ``Instruction, Input Text, Datasource, response, gold_label``.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode prompts and decode completions.
        datasource: Accepted for interface compatibility; not read by this
            function.
        csv_file_path: Destination CSV file (overwritten, UTF-8).
        seed: Seed applied with ``torch.manual_seed`` before generation;
            pass ``None`` to leave the RNG state untouched.
        hf_dataset: Object indexable with ``'test'`` yielding records with the
            keys ``instruction``, ``input``, ``datasource`` and ``output``.
            When omitted, a module-level ``hf_dataset`` global is used if one
            exists.
        enable_azure_content_safety, enable_sensitive_topics,
        enable_saleforce_content_safety: Accepted but not used here.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.
        The remaining parameters are forwarded to ``model.generate`` as-is.

    Raises:
        ValueError: If no dataset is supplied and no module-level
            ``hf_dataset`` exists.
    """
    # Note: Ensure that the appropriate tokenizer is used for the language.
    print("*** Ensure that you have replaced the default tokenizer with the appropriate one for your use case.")

    model.eval()

    # The original body read an undefined global `hf_dataset`; keep honoring
    # such a global when present, but fail with a clear message otherwise.
    if hf_dataset is None:
        hf_dataset = globals().get("hf_dataset")
    if hf_dataset is None:
        raise ValueError(
            "main() needs a dataset: pass hf_dataset=<dataset with a 'test' split>"
        )
    dataset = hf_dataset['test']

    # Make sampling reproducible, as the `seed` parameter promises.
    if seed is not None:
        torch.manual_seed(seed)

    # Prepare the CSV file for saving responses
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Instruction', 'Input Text', 'Datasource', 'response', 'gold_label'])  # Column headers

        for item in tqdm.tqdm(dataset):
            instruction = item['instruction']
            input_text = item['input']
            # Named so it does not overwrite the `datasource` parameter.
            item_datasource = item['datasource']
            gold_label = item['output']

            # Combine instruction and input_text for the prompt
            user_prompt = BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}")

            batch = tokenizer(user_prompt, return_tensors="pt")
            # Ensure tensors are on the same device as the model
            batch = {k: v.to(model.device) for k, v in batch.items()}
            prompt_length = batch["input_ids"].shape[-1]

            with torch.no_grad():
                outputs = model.generate(
                    **batch,
                    max_new_tokens=max_new_tokens,
                    do_sample=do_sample,
                    top_p=top_p,
                    temperature=temperature,
                    min_length=min_length,
                    use_cache=use_cache,
                    top_k=top_k,
                    repetition_penalty=repetition_penalty,
                    length_penalty=length_penalty,
                    **kwargs)

            # Drop the prompt by token position rather than by character
            # count: the decoded prompt is not guaranteed to have exactly
            # len(user_prompt) characters.
            output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

            # Write the instruction, input text, and output to the CSV file
            writer.writerow([instruction, input_text, item_datasource, output_text, gold_label])
            torch.cuda.empty_cache()
|
126 |
+
|
127 |
+
# Example of how to use the function
|
128 |
+
|
129 |
+
|
130 |
+
|
131 |
+
|
132 |
+
@st.cache_resource
def _load_model_and_tokenizer():
    """Load the base model, tokenizer and adapter once per server process.

    Streamlit re-executes the whole script on every interaction; caching the
    loaded objects avoids reloading the checkpoint on each rerun.

    Returns:
        Tuple ``(model, tokenizer, embedding_size)`` where ``embedding_size``
        is the number of input-embedding rows before any resize.
    """
    loaded_model = load_model(base_model_name, quantization=True)
    loaded_tokenizer = LlamaTokenizer.from_pretrained(adapters_name)
    original_embedding_size = loaded_model.get_input_embeddings().weight.shape[0]

    # The tokenizer comes from the adapter repository; if its vocabulary size
    # differs from the base model's embedding table, resize before loading
    # the adapter.
    if len(loaded_tokenizer) != original_embedding_size:
        print("resize the embedding size by the size of the tokenizer")
        loaded_model.resize_token_embeddings(len(loaded_tokenizer))

    # Load adapter model
    loaded_model.load_adapter(adapters_name)
    return loaded_model, loaded_tokenizer, original_embedding_size


model, tokenizer, embedding_size = _load_model_and_tokenizer()

# NOTE: a stray module-level `BASE_PROMPT.format(instruction=..., input_text=...)`
# expression used to sit here; it referenced names that do not exist at module
# scope (NameError on startup) and its result was discarded, so it was removed.

# Module-level generation settings read by predict().
max_new_tokens = 100  # The maximum numbers of tokens to generate
seed = 42  # seed value for reproducibility
do_sample = True  # Whether or not to use sampling; use greedy decoding otherwise.
min_length = None  # The minimum length of the sequence to be generated
use_cache = True  # [optional] Whether or not the model should use the past last key/values attentions
top_p = 1.0  # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature = 1.0  # [optional] The value used to modulate the next token probabilities.
top_k = 5  # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
repetition_penalty = 5.0  # The parameter for repetition penalty. 1.0 means no penalty.
length_penalty = 1  # [optional] Exponential penalty to the length used with beam-based generation.
enable_azure_content_safety = False  # Enable safety check with Azure content safety API
enable_sensitive_topics = False  # Enable check for sensitive topics using AuditNLG APIs
enable_saleforce_content_safety = False  # Enable safety check with Salesforce safety T5
|
163 |
+
|
164 |
+
|
165 |
+
def predict(instruction, input_text=" "):
    """Generate the model's answer for a single instruction.

    The prompt is built from ``BASE_PROMPT`` and generation uses the
    module-level ``model``, ``tokenizer`` and generation settings
    (``max_new_tokens``, ``do_sample``, ``top_p``, ``temperature``,
    ``min_length``, ``use_cache``, ``top_k``, ``repetition_penalty``,
    ``length_penalty``) as they are at call time.

    Args:
        instruction: The user's question or instruction.
        input_text: Optional extra input appended on the line after the
            instruction; defaults to a single space.

    Returns:
        The decoded completion with the prompt removed.
    """
    user_prompt = BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}")

    batch = tokenizer(user_prompt, return_tensors="pt")
    # Ensure tensors are on the same device as the model
    batch = {k: v.to(model.device) for k, v in batch.items()}
    prompt_length = batch["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            top_p=top_p,
            temperature=temperature,
            min_length=min_length,
            use_cache=use_cache,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty)

    # Strip the prompt by token position instead of by character count: the
    # decoded prompt is not guaranteed to be exactly len(user_prompt)
    # characters long, which would cut the answer at the wrong offset.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
|
194 |
+
|
195 |
+
|
196 |
+
|
197 |
+
|
198 |
+
# Streamlit page: a title, one question box, and an answer box.
st.title('LLM Interaction Interface')

user_input = st.text_input("Ask a question:")

if user_input:
    # An empty string is falsy, so nothing is generated until the user has
    # entered a question; the question is then sent to the model.
    response = predict(user_input)
    st.text_area("Response:", value=response, height=300)
|
206 |
+
# st.json({'value':response},expanded=False)
|