Update app.py
app.py CHANGED

@@ -1,25 +1,13 @@
 import os
 import json
 import streamlit as st
-from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
-from …
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
 import torch
 from huggingface_hub import Repository, HfFolder
 import subprocess
 
-# …
-hf_token = st.secrets["HF_TOKEN"]
-HfFolder.save_token(hf_token)
-
-# Set Git user identity
-def set_git_config():
-    try:
-        subprocess.run(['git', 'config', '--global', 'user.email', 'nilesh.hanotia@outlook.com'], check=True)
-        subprocess.run(['git', 'config', '--global', 'user.name', 'Nilesh'], check=True)
-    except subprocess.CalledProcessError as e:
-        st.error(f"Git configuration error: {str(e)}")
-
-set_git_config()
+# ... (keep the authentication and git config parts)
 
 @st.cache_data
 def load_data(file_path):
@@ -35,10 +23,10 @@ def load_data(file_path):
         return None
 
 @st.cache_resource
-def initialize_model_and_tokenizer(model_name):
+def initialize_model_and_tokenizer(model_name, num_labels):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
 
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
@@ -48,38 +36,18 @@ def initialize_model_and_tokenizer(model_name):
     except Exception as e:
         st.error(f"Error initializing model and tokenizer: {str(e)}")
         return None, None
-
-def create_dataset(data, tokenizer, max_length):
-    inputs = []
-    for item in data:
-        prompt = item['prompt']
-        response = item['response']
-        full_text = f"Human: {prompt}\nAssistant: {response}"
-        encoded = tokenizer.encode_plus(
-            full_text,
-            max_length=max_length,
-            padding='max_length',
-            truncation=True,
-            return_tensors='pt'
-        )
-        inputs.append({
-            'input_ids': encoded['input_ids'].squeeze(),
-            'attention_mask': encoded['attention_mask'].squeeze(),
-        })
-    return inputs
-
-class SimpleDataset(torch.utils.data.Dataset):
-    def __init__(self, encodings):
-        self.encodings = encodings
-
-    def __getitem__(self, idx):
-        encoding = self.encodings[idx]
-        # Debugging
-        print(f"Encoding keys: {encoding.keys()}")
-        return encoding
-
-    def __len__(self):
-        return len(self.encodings)
 
+def create_dataset(data, tokenizer, max_length):
+    texts = [f"Human: {item['prompt']}\nAssistant: {item['response']}" for item in data]
+    labels = [item['label'] for item in data]  # Ensure your data has 'label' field
+
+    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
+    dataset = Dataset.from_dict({
+        'input_ids': encodings['input_ids'],
+        'attention_mask': encodings['attention_mask'],
+        'labels': labels
+    })
+    return dataset
 
 def main():
     st.title("Model Training with Streamlit")
@@ -90,10 +58,11 @@ def main():
     num_epochs = st.number_input("Enter number of training epochs", min_value=1, max_value=10, value=3)
     batch_size = st.number_input("Enter batch size", min_value=1, max_value=32, value=8)
     learning_rate = st.number_input("Enter learning rate", min_value=1e-6, max_value=1e-3, value=5e-5, format="%.1e")
+    num_labels = st.number_input("Enter number of labels", min_value=2, max_value=10, value=2)
 
     repo_id = st.text_input("Enter Hugging Face repository ID", "nileshhanotia/PeVe")
 
-    tokenizer, model = initialize_model_and_tokenizer(model_name)
+    tokenizer, model = initialize_model_and_tokenizer(model_name, num_labels)
 
     if tokenizer is None or model is None:
         st.warning("Failed to initialize model and tokenizer. Please check the model name and try again.")
@@ -106,50 +75,33 @@ def main():
         st.warning("Failed to load dataset. Please check the file path and try again.")
         return
 
-    st.write("…")
-    tokenized_dataset = create_dataset(data, tokenizer, max_length)
-
-    dataset = SimpleDataset(tokenized_dataset)
+    st.write("Preparing dataset...")
+    dataset = create_dataset(data, tokenizer, max_length)
 
     training_args = TrainingArguments(
         output_dir='./results',
-        evaluation_strategy='…',
+        evaluation_strategy='epoch',
         learning_rate=learning_rate,
         per_device_train_batch_size=batch_size,
-        per_device_eval_batch_size=batch_size,
         num_train_epochs=num_epochs,
         weight_decay=0.01,
         logging_dir='./logs',
         logging_steps=10,
+        push_to_hub=True,
+        hub_model_id=repo_id,
     )
 
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=dataset,
-        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
     )
 
     if st.button('Start Training'):
         st.write("Starting training...")
-        progress_bar = st.progress(0)
-        repo = Repository(…)
-
-        for epoch in range(int(num_epochs)):
-            trainer.train()
-            progress = (epoch + 1) / num_epochs
-            progress_bar.progress(progress)
-
-            model_path = f"./results/model_epoch_{epoch+1}"
-            trainer.save_model(model_path)
-            tokenizer.save_pretrained(model_path)  # Save the tokenizer
-            st.write(f"Model and tokenizer saved locally: {model_path}")
-
-            repo.push_to_hub(commit_message=f"Model after epoch {epoch+1}")
-            st.write(f"Model pushed to Hugging Face Hub: {repo_id}")
-
-        st.write("Training complete. Model is available on the Hugging Face Hub.")
+        trainer.train()
+        trainer.push_to_hub()
+        st.write(f"Training complete. Model is available on the Hugging Face Hub: {repo_id}")
 
 if __name__ == "__main__":
-    main()
+    main()
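A note on the data contract implied by the diff: `load_data` presumably parses a JSON file (the app imports `json`), and the rewritten `create_dataset` indexes `item['prompt']`, `item['response']`, and the new `item['label']`, so every record now needs a classification target. A minimal sketch of a compatible file, where only the field names come from the diff; the filename and values are made up:

```python
import json

# Hypothetical records in the shape create_dataset() expects:
# 'prompt' and 'response' are folded into one text, 'label' becomes
# the target (an int in [0, num_labels)). Values are illustrative.
sample = [
    {"prompt": "Was the delivery on time?", "response": "Yes, it arrived early.", "label": 1},
    {"prompt": "Was the delivery on time?", "response": "No, two days late.", "label": 0},
]

with open("train.json", "w") as f:  # "train.json" is a made-up path
    json.dump(sample, f, indent=2)
```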
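The rewrite also changes the tokenization path: instead of `encode_plus` per item plus a custom `torch.utils.data.Dataset` subclass, a single batched tokenizer call returns plain Python lists (no `return_tensors`), and `datasets.Dataset.from_dict` accepts those lists directly. A self-contained sketch of that behaviour; `distilbert-base-uncased` is a small stand-in checkpoint only, since the app takes the real model name from user input:

```python
from datasets import Dataset
from transformers import AutoTokenizer

# Stand-in checkpoint for illustration only.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

texts = [
    "Human: Was the delivery on time?\nAssistant: Yes, it arrived early.",
    "Human: Was the delivery on time?\nAssistant: No, two days late.",
]
# Without return_tensors, the tokenizer yields lists of ints, not tensors.
encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=64)

dataset = Dataset.from_dict({
    "input_ids": encodings["input_ids"],
    "attention_mask": encodings["attention_mask"],
    "labels": [1, 0],
})
print(dataset)  # Dataset with 3 columns and 2 rows, consumable by Trainer
```

The `Trainer`'s default collator turns these list columns into tensors batch by batch, which is why the old explicit `DataCollatorForLanguageModeling` can simply be dropped.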
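On the Hub side, `push_to_hub=True` together with `hub_model_id=repo_id` lets `Trainer` create the repository and upload checkpoints itself, and the closing `trainer.push_to_hub()` uploads the final model with a generated model card, so the manual `Repository` loop goes away; this assumes the authentication block the placeholder comment says to keep (the `HF_TOKEN` saved via `HfFolder.save_token`) really is kept. One caveat, which depends on the installed `transformers` version: `evaluation_strategy='epoch'` with no `eval_dataset` passed to `Trainer` will either raise at construction time or fail when the first evaluation fires. A hypothetical drop-in for `main()`, reusing the diff's own names (`dataset`, `model`, `training_args`):

```python
# Hypothetical fix sketch: carve out a small eval split so that
# evaluation_strategy='epoch' has data to evaluate on.
split = dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
)
```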
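Once pushed, the fine-tuned classifier can be consumed from the Hub like any other checkpoint. A usage sketch under two assumptions: the default repo ID from the app, and the stock sequence-classification head, whose label names stay `LABEL_0`/`LABEL_1` unless an `id2label` mapping is configured:

```python
from transformers import pipeline

# Loads the model this Space pushes to; assumes the repo exists and is public.
clf = pipeline("text-classification", model="nileshhanotia/PeVe")

text = "Human: Was the delivery on time?\nAssistant: Yes, it arrived early."
print(clf(text))  # e.g. [{'label': 'LABEL_1', 'score': 0.93}]
```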