updated app
- README.md +1 -1
- __init__.py +0 -0
- data/__init__.py +0 -0
- data/__pycache__/__init__.cpython-310.pyc +0 -0
- data/__pycache__/fine_tune_dataset.cpython-310.pyc +0 -0
- data/fine_tune_dataset.py +31 -0
- fine_tuning_app.py +126 -0
- models/__init__.py +0 -0
- requirements.txt +10 -2
- scripts/__init__.py +0 -0
- scripts/__pycache__/__init__.cpython-310.pyc +0 -0
- scripts/__pycache__/finetune.cpython-310.pyc +0 -0
- scripts/finetune.py +86 -0
- app.py → telco_app.py +27 -6
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: gray
 colorTo: pink
 sdk: streamlit
 sdk_version: 1.38.0
-app_file: app.py
+app_file: ./app/fine_tuning_app.py
 pinned: false
 license: apache-2.0
 ---
__init__.py
ADDED
File without changes

data/__init__.py
ADDED
File without changes

data/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (133 Bytes)

data/__pycache__/fine_tune_dataset.cpython-310.pyc
ADDED
Binary file (1.13 kB)
data/fine_tune_dataset.py
ADDED
@@ -0,0 +1,31 @@
# import torch
# from trl import SFTTrainer
from datasets import load_dataset
# from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
# from unsloth import FastLanguageModel, is_bfloat16_supported


def load_data(dataset, tokenizer, samples=None):
    print("Loading finetuning dataset.")

    # Base models don't have chat templates, so we can choose any - ChatML is popular
    tokenizer = get_chat_template(tokenizer,
        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        chat_template="chatml",
    )

    def apply_template(examples):
        # Parse the ShareGPT-format dataset into the plain-text format we want
        messages = examples["conversations"]
        text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]
        return {"text": text}

    if samples is not None:
        # Reduce the training load by only training on a subset
        dataset = load_dataset(dataset, split=f"train[:{int(samples)}]")
    else:
        dataset = load_dataset(dataset, split="train")

    return dataset.map(apply_template, batched=True)
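For reference, a minimal sketch of how load_data might be called on its own, assuming the Unsloth loader and the same model and dataset names used elsewhere in this commit (the sample count here is purely illustrative):

# Illustrative only: load a tokenizer via Unsloth and prepare a small training subset.
from unsloth import FastLanguageModel
from data.fine_tune_dataset import load_data

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=2048,
    load_in_4bit=True,
)
# Only take the first 1000 samples to keep the run small (samples is optional).
dataset = load_data("mlabonne/FineTome-100k", tokenizer, samples=1000)
print(dataset[0]["text"][:200])  # inspect one ChatML-formatted example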
fine_tuning_app.py
ADDED
@@ -0,0 +1,126 @@
""" fine_tuning_app.py

Running a basic chatbot app that can compare base and fine-tuned models from Hugging Face.

Note:
- run using: streamlit run fine_tuning_app.py
- use free -h, then sudo sysctl vm.drop_caches=2, to free cache space, but this can mess up the venv
- may need to run huggingface-cli login in the terminal to enable access to gated models
- or see: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/130 for the above
- Hugging Face can use up a lot of disk space - cd ~/.cache/huggingface/hub then rm -rf <subdir>

"""

import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import time
import torch
from pynvml import *  # needs a restart of the IDE to install, from nvidia-ml-py3

# ---------------------------------------------------------------------------------------
# GENERAL SETUP:
# ---------------------------------------------------------------------------------------

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
hf_token = ""
# model_name = "thebigoed/PreFineLlama-3.1-8B"        # works badly as it does not know the chat structure
# model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"   # this is what we were fine-tuning - also bad without chat instruct
# model_name = "Qwen/Qwen2.5-7B-Instruct"             # working well now
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"    # very effective. NB: if using a fine-grained access token, make sure it can access gated repos

st.title("Fine Tuning Testing")
col1, col2 = st.columns(2)
if 'conversation' not in st.session_state:
    st.session_state.conversation = []
user_input = st.text_input("You:", "")  # user input

def print_gpu_utilization():
    # Used for basic resource monitoring.
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

# ---------------------------------------------------------------------------------------
# MODEL SETUP:
# ---------------------------------------------------------------------------------------

@st.cache_resource(show_spinner=False)
def load_model():
    """ Load model from Hugging Face. """
    print_gpu_utilization()
    # see https://huggingface.co/mlabonne/FineLlama-3.1-8B for how to run
    # https://huggingface.co/docs/transformers/main/en/chat_templating - look into this to decide on how we do templating
    success_placeholder = st.empty()
    with st.spinner("Loading model... please wait"):
        if str(DEVICE) == "cuda:0":  # may not need this; need to test on CPU whether device_map is okay anyway
            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto")

        model = AutoModelForCausalLM.from_pretrained(model_name,
                                                     torch_dtype="auto",
                                                     device_map="auto"
                                                     )

        # Not using terminators at the moment
        # terminator = tokenizer.eos_token if tokenizer.eos_token else "<|endoftext|>"

        success_placeholder.success("Model loaded successfully!", icon="🔥")
        time.sleep(2)
        success_placeholder.empty()
    print_gpu_utilization()
    return model, tokenizer


def generate_response():
    """ Query the model. """

    success_placeholder = st.empty()
    with st.spinner("Thinking..."):

        # Tokenising the conversation
        if tokenizer.chat_template:
            text = tokenizer.apply_chat_template(st.session_state.conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
        else:  # base models do not have chat templates
            print("Assuming base model.")
            model_input = ""
            for entry in st.session_state.conversation:
                model_input += f"{entry['role']}: {entry['content']}\n"
            text = tokenizer(model_input + "assistant: ", return_tensors="pt")["input_ids"].to(DEVICE)
        outputs = model.generate(text,
                                 max_new_tokens=512,
                                 )
        outputs = tokenizer.batch_decode(outputs[:, text.shape[1]:], skip_special_tokens=True)[0]
        print_gpu_utilization()

        success_placeholder.success("Response generated!", icon="✅")
        time.sleep(2)
        success_placeholder.empty()
    return outputs

# ---------------------------------------------------------------------------------------
# RUNTIME EVENTS:
# ---------------------------------------------------------------------------------------

model, tokenizer = load_model()

# Submit button to send the query
with col1:
    if st.button("send"):
        if user_input:
            st.session_state.conversation.append({"role": "user", "content": user_input})
            st.session_state.conversation.append({"role": "assistant", "content": generate_response()})

# Clear button to reset
with col2:
    if st.button("clear chat"):
        st.session_state.conversation = []

# Display conversation history
for chat in st.session_state.conversation:
    if chat['role'] == 'user':
        st.write(f"You: {chat['content']}")
    else:
        st.write(f"Assistant: {chat['content']}")
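Since the app leaves hf_token empty and the notes above rely on huggingface-cli login, here is a minimal sketch of authenticating programmatically instead, assuming the huggingface_hub package (pulled in via huggingface_hub[cli] in requirements.txt) and an HF_TOKEN environment variable, which is an illustrative name rather than something the app defines:

# Illustrative alternative to a hard-coded hf_token: read the token from the
# environment and log in before load_model() runs.
import os
from huggingface_hub import login

hf_token = os.environ.get("HF_TOKEN", "")  # assumed variable name
if hf_token:
    login(token=hf_token)  # gives transformers access to gated repos such as meta-llama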
models/__init__.py
ADDED
File without changes
requirements.txt
CHANGED
@@ -1,3 +1,11 @@
-streamlit
 transformers
-
+torch
+unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
+xformers
+trl
+bitsandbytes
+peft
+accelerate
+streamlit
+nvidia-ml-py3
+huggingface_hub[cli]
scripts/__init__.py
ADDED
File without changes

scripts/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (136 Bytes)

scripts/__pycache__/finetune.cpython-310.pyc
ADDED
Binary file (1.95 kB)
scripts/finetune.py
ADDED
@@ -0,0 +1,86 @@
# run as a module using: python3 -m scripts.finetune

# Using: https://huggingface.co/blog/mlabonne/sft-llama3

import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

from data.fine_tune_dataset import load_data

def finetune(model="unsloth/Meta-Llama-3.1-8B-bnb-4bit", dataset="mlabonne/FineTome-100k"):

    hf_token = ""

    # Loading the model and restricting the context window
    max_seq_length = 2048
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model,
        max_seq_length=max_seq_length,
        load_in_4bit=True,
        dtype=None,
    )

    # Loading the prepared dataset
    dataset = load_data(dataset, tokenizer)

    # Loading the model for fine-tuning - only set to fine-tune ~42 million of the 8 billion parameters
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # rank determines the LoRA (low-rank adaptation - freezing much of the model for fine-tuning) matrix size; higher increases memory and compute cost
        lora_alpha=16,  # scaling factor for updates
        lora_dropout=0,  # not used, for speedup
        target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],  # where LoRA targets
        use_rslora=True,  # rank stabilised
        use_gradient_checkpointing="unsloth"
    )

    # Saving the untrained model; save_method can be lora to only save adapters, or merged (16 or 4 bit)
    model.save_pretrained_merged("models/PreFineLlama-3.1-8B", tokenizer, save_method="merged_16bit")  # save to the models directory locally
    model.push_to_hub_merged("thebigoed/PreFineLlama-3.1-8B", tokenizer, token=hf_token, save_method="merged_16bit")

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=True,
        args=TrainingArguments(
            learning_rate=3e-4,  # too low = slow and local minima, too high = unstable
            lr_scheduler_type="linear",  # adjusts the learning rate (linear and cosine are the most popular)
            per_device_train_batch_size=8,
            gradient_accumulation_steps=2,
            num_train_epochs=1,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            warmup_steps=10,
            output_dir="output",
            seed=0,
        ),
    )

    trainer.train()

    # Saving the model; save_method can be lora to only save adapters, or merged (16 or 4 bit)
    model.save_pretrained_merged("models/FineLlama-3.1-8B", tokenizer, save_method="merged_16bit")  # save to the models directory locally
    model.push_to_hub_merged("thebigoed/FineLlama-3.1-8B", tokenizer, token=hf_token, save_method="merged_16bit")

    # Use to save in GGUF quantised format
    # quant_methods = ["q2_k", "q3_k_m", "q4_k_m", "q5_k_m", "q6_k", "q8_0"]
    # for quant in quant_methods:
    #     model.push_to_hub_gguf("", tokenizer, quant)

    return

if __name__ == "__main__":
    finetune()
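For completeness, a minimal sketch of pulling the merged checkpoint back down for inference with plain transformers, assuming the push_to_hub_merged call above succeeded, the repo name is unchanged, and the ChatML template applied during training was saved with the tokenizer (the prompt here is illustrative):

# Illustrative only: query the fine-tuned, merged model with vanilla transformers.
from transformers import AutoTokenizer, AutoModelForCausalLM

repo = "thebigoed/FineLlama-3.1-8B"  # repo pushed by finetune() above
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "What is 5G network slicing?"}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0, inputs.shape[1]:], skip_special_tokens=True))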
app.py → telco_app.py
RENAMED
@@ -1,6 +1,11 @@
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import time
+import torch
+from pynvml import *  # needs a restart of the IDE to install, from nvidia-ml-py3
+
+# Get device
+DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Streamlit setup
 st.title("Telco Chat Bot")
@@ -11,16 +16,30 @@ if 'conversation' not in st.session_state:
     st.session_state.conversation = []
 user_input = st.text_input("You:", "")  # user input
 
+# Resource monitoring:
+def print_gpu_utilization():
+    nvmlInit()
+    handle = nvmlDeviceGetHandleByIndex(0)
+    info = nvmlDeviceGetMemoryInfo(handle)
+    print(f"GPU memory occupied: {info.used//1024**2} MB.")
+
 
 # Model functions:
 @st.cache_resource(show_spinner=False)
 def load_model():
     """ Load model from Hugging face."""
+    print_gpu_utilization()
     success_placeholder = st.empty()
     with st.spinner("Loading model... please wait"):
-        model_name = "AliMaatouk/TinyLlama-1.1B-Tele"  # Replace with the correct model name
-
-
+        # model_name = "AliMaatouk/TinyLlama-1.1B-Tele"  # Replace with the correct model name
+        # model_name = "AliMaatouk/LLama-3-8B-Tele-it"
+        model_name = "AliMaatouk/Gemma-2B-Tele"
+        if str(DEVICE) == "cuda:0":  # may not need this; need to test on CPU whether device_map is okay anyway
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype="auto")
+        model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
+
         success_placeholder.success("Model loaded successfully!", icon="🔥")
         time.sleep(2)
         success_placeholder.empty()
@@ -30,14 +49,16 @@ def generate_response(user_input):
     """ Query the model. """
     success_placeholder = st.empty()
     with st.spinner("Thinking..."):
-        inputs = tokenizer(user_input, return_tensors="pt")
+        inputs = tokenizer(user_input, return_tensors="pt").to(DEVICE)
         #outputs = model.generate(**inputs, max_length=1000, pad_token_id=tokenizer.eos_token_id)
-        outputs = model.generate(**inputs, max_new_tokens=
+        outputs = model.generate(**inputs, max_new_tokens=750)
+        print_gpu_utilization()
         generated_tokens = outputs[0, len(inputs['input_ids'][0]):]
         success_placeholder.success("Response generated!", icon="✅")
         time.sleep(2)
         success_placeholder.empty()
-
+        text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        return text
 
 # RUNTIME EVENTS: