israel committed
Commit 426b9bb
1 Parent(s): a745721

Create app.py

Files changed (1):
  app.py  +206  -0
app.py ADDED
@@ -0,0 +1,206 @@
import streamlit as st
import torch
import os
import sys
import time
import json
from typing import List
import datasets
import csv
import tqdm

from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
from peft import PeftModel


base_model_name = "EthioNLP/Amharic-llama-base-model"
adapters_name = 'EthioNLP/Amharic-LLAMA-all-data'


BASE_PROMPT = """Below is an interaction between a human and an AI fluent in English and Amharic, providing reliable and informative answers. The AI is supposed to answer test questions from the human with short responses saying just the answer and nothing else.

Human: {instruction}

Assistant [Amharic] : """

# Function to load the main model for text generation
def load_model(model_name, quantization):
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        return_dict=True,
        load_in_8bit=quantization,
        device_map='cuda:0',
        low_cpu_mem_usage=True,
    )
    return model


# Function to load the PeftModel for performance optimization
def load_peft_model(model, peft_model):
    peft_model = PeftModel.from_pretrained(model, peft_model, offload_folder='./')
    return peft_model


# Load a model from its config so that FSDP checkpoints can be loaded into it
def load_llama_from_config(config_path):
    model_config = LlamaConfig.from_pretrained(config_path)
    model = LlamaForCausalLM(config=model_config)
    return model

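# Illustrative sketch (not executed): the helpers above could also be combined to
# attach the PEFT adapter explicitly, instead of the model.load_adapter() call used
# further below. Assumes a CUDA GPU and the repo names defined at the top of the file.
#
# base = load_model(base_model_name, quantization=True)
# amharic_model = load_peft_model(base, adapters_name)
# amharic_model.eval()
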
def main(
    model,
    tokenizer,
    hf_dataset,  # Hugging Face DatasetDict with a 'test' split to evaluate
    datasource,  # List of data sources to use, no default value
    csv_file_path,  # Path to the CSV file to save responses, no default value
    max_new_tokens=100,  # The maximum number of tokens to generate
    seed=42,  # Seed value for reproducibility
    do_sample=True,  # Whether or not to use sampling; use greedy decoding otherwise
    min_length=None,  # The minimum length of the sequence to be generated
    use_cache=True,  # [optional] Whether or not the model should use the past key/values attentions
    top_p=1.0,  # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation
    temperature=1.0,  # [optional] The value used to modulate the next token probabilities
    top_k=5,  # [optional] The number of highest probability vocabulary tokens to keep for top-k filtering
    repetition_penalty=5.0,  # The parameter for repetition penalty; 1.0 means no penalty
    length_penalty=1,  # [optional] Exponential penalty to the length used with beam-based generation
    enable_azure_content_safety=False,  # Enable safety check with Azure content safety API
    enable_sensitive_topics=False,  # Enable check for sensitive topics using AuditNLG APIs
    enable_saleforce_content_safety=False,  # Enable safety check with Salesforce safety T5
    **kwargs  # Additional arguments for the model.generate function
):
    # Note: ensure that the appropriate tokenizer is used for the language.
    print("*** Ensure that you have replaced the default tokenizer with the appropriate one for your use case.")

    model.eval()

    # Use the test split of the Hugging Face dataset
    dataset = hf_dataset['test']

    # Prepare the CSV file for saving responses
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Instruction', 'Input Text', 'Datasource', 'response', 'gold_label'])  # Column headers

        for item in tqdm.tqdm(dataset):  # Change to the desired split if necessary
            instruction = item['instruction']  # Extracting the instruction
            input_text = item['input']  # Extracting the input text
            datasource = item['datasource']
            gold_label = item['output']

            # Combine instruction and input_text for the prompt
            user_prompt = BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}")

            batch = tokenizer(user_prompt, return_tensors="pt")
            batch = {k: v.to(model.device) for k, v in batch.items()}  # Ensure tensors are on the same device as the model

            start = time.perf_counter()

            with torch.no_grad():
                outputs = model.generate(
                    **batch,
                    max_new_tokens=max_new_tokens,
                    do_sample=do_sample,
                    top_p=top_p,
                    temperature=temperature,
                    min_length=min_length,
                    use_cache=use_cache,
                    top_k=top_k,
                    repetition_penalty=repetition_penalty,
                    length_penalty=length_penalty,
                    **kwargs)

            e2e_inference_time = (time.perf_counter() - start) * 1000
            # print(f"Inference time: {e2e_inference_time} ms")

            output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(user_prompt):]
            # print("Model Output: {}".format(output_text))

            # Write the instruction, input text, and output to the CSV file
            writer.writerow([instruction, input_text, datasource, output_text, gold_label])
            torch.cuda.empty_cache()


# Example of how to use the function
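# Illustrative sketch (not executed): the dataset id below is a placeholder; swap in a
# real evaluation set whose 'test' split has 'instruction', 'input', 'datasource' and
# 'output' fields. `model` and `tokenizer` are prepared just below.
#
# hf_dataset = datasets.load_dataset("your-org/your-amharic-eval-set")
# main(model, tokenizer, hf_dataset,
#      datasource=None,                      # overwritten per example inside the loop
#      csv_file_path="responses.csv")
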
model = load_model(base_model_name, quantization=True)

tokenizer = LlamaTokenizer.from_pretrained(adapters_name)
embedding_size = model.get_input_embeddings().weight.shape[0]

if len(tokenizer) != embedding_size:
    print("Resizing the token embeddings to match the tokenizer vocabulary size")
    model.resize_token_embeddings(len(tokenizer))


# Load adapter model
model.load_adapter(adapters_name)


# Generation settings used by predict()
max_new_tokens = 100        # The maximum number of tokens to generate
seed = 42                   # Seed value for reproducibility
do_sample = True            # Whether or not to use sampling; use greedy decoding otherwise
min_length = None           # The minimum length of the sequence to be generated
use_cache = True            # [optional] Whether or not the model should use the past key/values attentions
top_p = 1.0                 # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation
temperature = 1.0           # [optional] The value used to modulate the next token probabilities
top_k = 5                   # [optional] The number of highest probability vocabulary tokens to keep for top-k filtering
repetition_penalty = 5.0    # The parameter for repetition penalty; 1.0 means no penalty
length_penalty = 1          # [optional] Exponential penalty to the length used with beam-based generation
enable_azure_content_safety = False       # Enable safety check with Azure content safety API
enable_sensitive_topics = False           # Enable check for sensitive topics using AuditNLG APIs
enable_saleforce_content_safety = False   # Enable safety check with Salesforce safety T5


def predict(instruction, input_text=" "):
    user_prompt = BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}")

    batch = tokenizer(user_prompt, return_tensors="pt")
    batch = {k: v.to(model.device) for k, v in batch.items()}  # Ensure tensors are on the same device as the model

    start = time.perf_counter()

    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            top_p=top_p,
            temperature=temperature,
            min_length=min_length,
            use_cache=use_cache,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty)

    e2e_inference_time = (time.perf_counter() - start) * 1000
    # print(f"Inference time: {e2e_inference_time} ms")

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(user_prompt):]

    return output_text


st.title('LLM Interaction Interface')

user_input = st.text_input("Ask a question:")

if user_input:
    # Send the question to the LLM and display its response
    response = predict(user_input)
    st.text_area("Response:", value=response, height=300, max_chars=None, help=None)
    # st.json({'value': response}, expanded=False)
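
# To try the app locally (a sketch of the usual workflow, assuming streamlit, torch,
# transformers, peft and bitsandbytes are installed and a CUDA GPU is available, since
# the base model is loaded in 8-bit on 'cuda:0'):
#
#   streamlit run app.py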