nileshhanotia committed
Commit 145afb1 · verified · 1 parent: b074295

Update app.py

Files changed (1):
  app.py (+27, -75)
app.py CHANGED
@@ -1,25 +1,13 @@
 import os
 import json
 import streamlit as st
-from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
-from transformers import DataCollatorForLanguageModeling
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
 import torch
 from huggingface_hub import Repository, HfFolder
 import subprocess
 
-# Authenticate Hugging Face Hub
-hf_token = st.secrets["HF_TOKEN"]
-HfFolder.save_token(hf_token)
-
-# Set Git user identity
-def set_git_config():
-    try:
-        subprocess.run(['git', 'config', '--global', 'user.email', 'nilesh.hanotia@outlook.com'], check=True)
-        subprocess.run(['git', 'config', '--global', 'user.name', 'Nilesh'], check=True)
-    except subprocess.CalledProcessError as e:
-        st.error(f"Git configuration error: {str(e)}")
-
-set_git_config()
+# ... (keep the authentication and git config parts)
 
 @st.cache_data
 def load_data(file_path):
@@ -35,10 +23,10 @@ def load_data(file_path):
     return None
 
 @st.cache_resource
-def initialize_model_and_tokenizer(model_name):
+def initialize_model_and_tokenizer(model_name, num_labels):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
 
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
@@ -48,38 +36,18 @@ def initialize_model_and_tokenizer(model_name):
     except Exception as e:
         st.error(f"Error initializing model and tokenizer: {str(e)}")
         return None, None
-
-def create_dataset(data, tokenizer, max_length):
-    inputs = []
-    for item in data:
-        prompt = item['prompt']
-        response = item['response']
-        full_text = f"Human: {prompt}\nAssistant: {response}"
-        encoded = tokenizer.encode_plus(
-            full_text,
-            max_length=max_length,
-            padding='max_length',
-            truncation=True,
-            return_tensors='pt'
-        )
-        inputs.append({
-            'input_ids': encoded['input_ids'].squeeze(),
-            'attention_mask': encoded['attention_mask'].squeeze(),
-        })
-    return inputs
-
-class SimpleDataset(torch.utils.data.Dataset):
-    def __init__(self, encodings):
-        self.encodings = encodings
-
-    def __getitem__(self, idx):
-        encoding = self.encodings[idx]
-        # Debugging
-        print(f"Encoding keys: {encoding.keys()}")
-        return encoding
-
-    def __len__(self):
-        return len(self.encodings)
+
+def create_dataset(data, tokenizer, max_length):
+    texts = [f"Human: {item['prompt']}\nAssistant: {item['response']}" for item in data]
+    labels = [item['label'] for item in data]  # Ensure your data has 'label' field
+
+    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
+    dataset = Dataset.from_dict({
+        'input_ids': encodings['input_ids'],
+        'attention_mask': encodings['attention_mask'],
+        'labels': labels
+    })
+    return dataset
 
 def main():
     st.title("Model Training with Streamlit")
@@ -90,10 +58,11 @@ def main():
     num_epochs = st.number_input("Enter number of training epochs", min_value=1, max_value=10, value=3)
     batch_size = st.number_input("Enter batch size", min_value=1, max_value=32, value=8)
     learning_rate = st.number_input("Enter learning rate", min_value=1e-6, max_value=1e-3, value=5e-5, format="%.1e")
+    num_labels = st.number_input("Enter number of labels", min_value=2, max_value=10, value=2)
 
     repo_id = st.text_input("Enter Hugging Face repository ID", "nileshhanotia/PeVe")
 
-    tokenizer, model = initialize_model_and_tokenizer(model_name)
+    tokenizer, model = initialize_model_and_tokenizer(model_name, num_labels)
 
     if tokenizer is None or model is None:
         st.warning("Failed to initialize model and tokenizer. Please check the model name and try again.")
@@ -106,50 +75,33 @@ def main():
         st.warning("Failed to load dataset. Please check the file path and try again.")
         return
 
-    st.write("Tokenizing dataset...")
-    tokenized_dataset = create_dataset(data, tokenizer, max_length)
-
-    dataset = SimpleDataset(tokenized_dataset)
+    st.write("Preparing dataset...")
+    dataset = create_dataset(data, tokenizer, max_length)
 
     training_args = TrainingArguments(
         output_dir='./results',
-        evaluation_strategy='no',
+        evaluation_strategy='epoch',
         learning_rate=learning_rate,
         per_device_train_batch_size=batch_size,
-        per_device_eval_batch_size=batch_size,
         num_train_epochs=num_epochs,
         weight_decay=0.01,
         logging_dir='./logs',
        logging_steps=10,
+        push_to_hub=True,
+        hub_model_id=repo_id,
     )
 
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=dataset,
-        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
     )
 
     if st.button('Start Training'):
         st.write("Starting training...")
-        progress_bar = st.progress(0)
-
-        repo = Repository(local_dir="./results", clone_from=repo_id)
-
-        for epoch in range(int(num_epochs)):
-            trainer.train()
-            progress = (epoch + 1) / num_epochs
-            progress_bar.progress(progress)
-
-            model_path = f"./results/model_epoch_{epoch+1}"
-            trainer.save_model(model_path)
-            tokenizer.save_pretrained(model_path)  # Save the tokenizer
-            st.write(f"Model and tokenizer saved locally: {model_path}")
-
-            repo.push_to_hub(commit_message=f"Model after epoch {epoch+1}")
-            st.write(f"Model pushed to Hugging Face Hub: {repo_id}")
-
-        st.write("Training complete. Model is available on the Hugging Face Hub.")
+        trainer.train()
+        trainer.push_to_hub()
+        st.write(f"Training complete. Model is available on the Hugging Face Hub: {repo_id}")
 
 if __name__ == "__main__":
-    main()
+    main()
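A few notes on this revision. First, the rewritten create_dataset reads three fields from every record: 'prompt', 'response', and 'label' (the in-code comment already flags the last one). A minimal sketch of data that load_data would need to return, with made-up values and integer class labels assumed to start at 0:

import json

# Illustrative records only; 'label' is the field create_dataset() copies
# into the 'labels' column that AutoModelForSequenceClassification trains on.
sample_data = [
    {"prompt": "Is this review positive?", "response": "I loved it.", "label": 1},
    {"prompt": "Is this review positive?", "response": "It broke after a day.", "label": 0},
]

with open("train.json", "w") as f:
    json.dump(sample_data, f)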
 
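Second, the pad-token fallback (tokenizer.pad_token = tokenizer.eos_token) fixes the tokenizer but not the model: a GPT-style sequence-classification head also needs pad_token_id on its config before it can batch padded inputs. A sketch of the missing step, using gpt2 as a stand-in checkpoint:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# gpt2 stands in for any checkpoint that ships without a native pad token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Without this, training fails with
# "Cannot handle batch sizes > 1 if no padding token is defined."
model.config.pad_token_id = tokenizer.pad_token_id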
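Third, evaluation_strategy='epoch' makes the Trainer require an eval_dataset, and this revision never passes one, so Trainer raises a ValueError before the first step. One way to satisfy it, reusing the dataset, model, and training_args names from the diff, would be a hold-out split:

# Hold out 10% of the tokenized dataset for the per-epoch evaluation pass.
split = dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
)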
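Finally, with push_to_hub=True and hub_model_id set, the Trainer creates the repo and pushes checkpoints itself, which is what made the manual Repository clone and per-epoch push loop removable; the now-unused Repository import could be dropped as well. Pushing still needs a write token, so the authentication kept behind the '# ...' placeholder remains necessary. An equivalent login sketch, assuming the same HF_TOKEN Streamlit secret as the previous revision:

import streamlit as st
from huggingface_hub import login

# Same secret the old code stored via HfFolder.save_token(); login()
# writes it to the local token cache that Trainer.push_to_hub() uses.
login(token=st.secrets["HF_TOKEN"])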