nileshhanotia committed
Commit 145afb1 · verified · 1 parent: b074295

Update app.py

Files changed (1):
  app.py (+27, -75)
app.py CHANGED
@@ -1,25 +1,13 @@
 import os
 import json
 import streamlit as st
-from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
-from transformers import DataCollatorForLanguageModeling
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
 import torch
 from huggingface_hub import Repository, HfFolder
 import subprocess
 
-# Authenticate Hugging Face Hub
-hf_token = st.secrets["HF_TOKEN"]
-HfFolder.save_token(hf_token)
-
-# Set Git user identity
-def set_git_config():
-    try:
-        subprocess.run(['git', 'config', '--global', 'user.email', 'nilesh.hanotia@outlook.com'], check=True)
-        subprocess.run(['git', 'config', '--global', 'user.name', 'Nilesh'], check=True)
-    except subprocess.CalledProcessError as e:
-        st.error(f"Git configuration error: {str(e)}")
-
-set_git_config()
+# ... (keep the authentication and git config parts)
 
 @st.cache_data
 def load_data(file_path):
@@ -35,10 +23,10 @@ def load_data(file_path):
     return None
 
 @st.cache_resource
-def initialize_model_and_tokenizer(model_name):
+def initialize_model_and_tokenizer(model_name, num_labels):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
 
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
@@ -48,38 +36,18 @@ def initialize_model_and_tokenizer(model_name):
     except Exception as e:
         st.error(f"Error initializing model and tokenizer: {str(e)}")
         return None, None
-
-def create_dataset(data, tokenizer, max_length):
-    inputs = []
-    for item in data:
-        prompt = item['prompt']
-        response = item['response']
-        full_text = f"Human: {prompt}\nAssistant: {response}"
-        encoded = tokenizer.encode_plus(
-            full_text,
-            max_length=max_length,
-            padding='max_length',
-            truncation=True,
-            return_tensors='pt'
-        )
-        inputs.append({
-            'input_ids': encoded['input_ids'].squeeze(),
-            'attention_mask': encoded['attention_mask'].squeeze(),
-        })
-    return inputs
-
-class SimpleDataset(torch.utils.data.Dataset):
-    def __init__(self, encodings):
-        self.encodings = encodings
-
-    def __getitem__(self, idx):
-        encoding = self.encodings[idx]
-        # Debugging
-        print(f"Encoding keys: {encoding.keys()}")
-        return encoding
-
-    def __len__(self):
-        return len(self.encodings)
+
+def create_dataset(data, tokenizer, max_length):
+    texts = [f"Human: {item['prompt']}\nAssistant: {item['response']}" for item in data]
+    labels = [item['label'] for item in data]  # Ensure your data has 'label' field
+
+    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
+    dataset = Dataset.from_dict({
+        'input_ids': encodings['input_ids'],
+        'attention_mask': encodings['attention_mask'],
+        'labels': labels
+    })
+    return dataset
 
 def main():
     st.title("Model Training with Streamlit")
@@ -90,10 +58,11 @@ def main():
     num_epochs = st.number_input("Enter number of training epochs", min_value=1, max_value=10, value=3)
     batch_size = st.number_input("Enter batch size", min_value=1, max_value=32, value=8)
     learning_rate = st.number_input("Enter learning rate", min_value=1e-6, max_value=1e-3, value=5e-5, format="%.1e")
+    num_labels = st.number_input("Enter number of labels", min_value=2, max_value=10, value=2)
 
     repo_id = st.text_input("Enter Hugging Face repository ID", "nileshhanotia/PeVe")
 
-    tokenizer, model = initialize_model_and_tokenizer(model_name)
+    tokenizer, model = initialize_model_and_tokenizer(model_name, num_labels)
 
     if tokenizer is None or model is None:
         st.warning("Failed to initialize model and tokenizer. Please check the model name and try again.")
@@ -106,50 +75,33 @@ def main():
         st.warning("Failed to load dataset. Please check the file path and try again.")
         return
 
-    st.write("Tokenizing dataset...")
-    tokenized_dataset = create_dataset(data, tokenizer, max_length)
-
-    dataset = SimpleDataset(tokenized_dataset)
+    st.write("Preparing dataset...")
+    dataset = create_dataset(data, tokenizer, max_length)
 
     training_args = TrainingArguments(
         output_dir='./results',
-        evaluation_strategy='no',
+        evaluation_strategy='epoch',
         learning_rate=learning_rate,
         per_device_train_batch_size=batch_size,
-        per_device_eval_batch_size=batch_size,
         num_train_epochs=num_epochs,
         weight_decay=0.01,
         logging_dir='./logs',
        logging_steps=10,
+        push_to_hub=True,
+        hub_model_id=repo_id,
     )
 
     trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=dataset,
-        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
     )
 
     if st.button('Start Training'):
         st.write("Starting training...")
-        progress_bar = st.progress(0)
-
-        repo = Repository(local_dir="./results", clone_from=repo_id)
-
-        for epoch in range(int(num_epochs)):
-            trainer.train()
-            progress = (epoch + 1) / num_epochs
-            progress_bar.progress(progress)
-
-            model_path = f"./results/model_epoch_{epoch+1}"
-            trainer.save_model(model_path)
-            tokenizer.save_pretrained(model_path)  # Save the tokenizer
-            st.write(f"Model and tokenizer saved locally: {model_path}")
-
-            repo.push_to_hub(commit_message=f"Model after epoch {epoch+1}")
-            st.write(f"Model pushed to Hugging Face Hub: {repo_id}")
-
-        st.write("Training complete. Model is available on the Hugging Face Hub.")
+        trainer.train()
+        trainer.push_to_hub()
+        st.write(f"Training complete. Model is available on the Hugging Face Hub: {repo_id}")
 
 if __name__ == "__main__":
-    main()
+    main()
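A few notes on this revision. First, the rewritten create_dataset reads three fields from every record: 'prompt', 'response', and 'label' (the in-code comment already flags the last one). A minimal sketch of data that load_data would need to return, with made-up values and integer class labels assumed to start at 0:

import json

# Illustrative records only; 'label' is the field create_dataset() copies
# into the 'labels' column that AutoModelForSequenceClassification trains on.
sample_data = [
    {"prompt": "Is this review positive?", "response": "I loved it.", "label": 1},
    {"prompt": "Is this review positive?", "response": "It broke after a day.", "label": 0},
]

with open("train.json", "w") as f:
    json.dump(sample_data, f)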
 
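Second, the pad-token fallback (tokenizer.pad_token = tokenizer.eos_token) fixes the tokenizer but not the model: a GPT-style sequence-classification head also needs pad_token_id on its config before it can batch padded inputs. A sketch of the missing step, using gpt2 as a stand-in checkpoint:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# gpt2 stands in for any checkpoint that ships without a native pad token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Without this, training fails with
# "Cannot handle batch sizes > 1 if no padding token is defined."
model.config.pad_token_id = tokenizer.pad_token_id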
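Third, evaluation_strategy='epoch' makes the Trainer require an eval_dataset, and this revision never passes one, so Trainer raises a ValueError before the first step. One way to satisfy it, reusing the dataset, model, and training_args names from the diff, would be a hold-out split:

# Hold out 10% of the tokenized dataset for the per-epoch evaluation pass.
split = dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
)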
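Finally, with push_to_hub=True and hub_model_id set, the Trainer creates the repo and pushes checkpoints itself, which is what made the manual Repository clone and per-epoch push loop removable; the now-unused Repository import could be dropped as well. Pushing still needs a write token, so the authentication kept behind the '# ...' placeholder remains necessary. An equivalent login sketch, assuming the same HF_TOKEN Streamlit secret as the previous revision:

import streamlit as st
from huggingface_hub import login

# Same secret the old code stored via HfFolder.save_token(); login()
# writes it to the local token cache that Trainer.push_to_hub() uses.
login(token=st.secrets["HF_TOKEN"])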