Keeby-smilyai committed on
Commit
7bad00a
·
verified ·
1 Parent(s): c750cf8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -0
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
3
+ from datasets import load_dataset
4
+ import torch
5
+ import os
6
+
7
+ #-------------------------------Functions----------------------------------------------#
8
def load_and_preprocess_data(dataset_name, tokenizer, config_name=None,
                             split="train", max_length=128, num_proc=4):
    """Load a text dataset from the Hub and tokenize it for causal-LM training.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        tokenizer: Tokenizer applied to the dataset's ``"text"`` column. It
            must already have a pad token, because every example is padded
            to ``max_length``.
        config_name: Optional dataset configuration name (for example
            ``"wikitext-2-raw-v1"``); forwarded as ``load_dataset``'s ``name``.
        split: Name of the split to load.
        max_length: Length every example is truncated or padded to.
        num_proc: Number of worker processes used by ``Dataset.map``.

    Returns:
        ``(tokenized_dataset, None)`` on success, or ``(None, error_message)``
        when loading or tokenizing fails. This function does not raise for
        those failures; the caller is expected to check the message.
    """
    try:
        dataset = load_dataset(dataset_name, name=config_name, split=split)
    except Exception as e:
        return None, f"Error loading dataset: {e}"

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    try:
        # Blank rows would tokenize to nothing but padding. The causal-LM
        # collator masks padding out of the loss, so such rows only waste
        # steps (and an all-blank batch has no valid targets at all).
        dataset = dataset.filter(lambda example: bool(example["text"].strip()))
        tokenized_datasets = dataset.map(
            tokenize_function,
            batched=True,
            num_proc=num_proc,
            # Drop every raw column so only tokenizer outputs remain.
            remove_columns=dataset.column_names,
        )
    except Exception as e:
        return None, f"Error tokenizing dataset: {e}"

    return tokenized_datasets, None
#---------------------------------------------------------------------------------------#


def train_model(architecture_size, api_key, repo_name, push_to_hub):
    """Fine-tune a GPT-2 family model on WikiText-2 and report the outcome.

    Args:
        architecture_size: One of ``"Small"``, ``"Medium"`` or ``"Large"``.
        api_key: Hugging Face token; only required when ``push_to_hub`` is set.
        repo_name: Target Hub repository; only required when ``push_to_hub``
            is set.
        push_to_hub: Whether to upload the trained model to the Hub.

    Returns:
        A human-readable status string. Every failure (invalid input, dataset
        problems, training errors) is reported as a message starting with
        ``"❌"`` instead of being raised, so the UI can display it directly.
    """
    # Map architecture size to model name
    model_name_mapping = {
        "Small": "distilgpt2",
        "Medium": "gpt2",
        "Large": "gpt2-medium",
    }
    model_name = model_name_mapping.get(architecture_size)
    if model_name is None:
        return f"❌ Error: Unknown model size {architecture_size!r}."

    # Validate push_to_hub inputs before doing any expensive work.
    if push_to_hub:
        if not api_key or not api_key.strip():
            return "❌ Error: You must provide a Hugging Face API key if pushing to hub is selected."
        if not repo_name or not repo_name.strip():
            return "❌ Error: You must provide a repository name if pushing to hub is selected."
        # Pasted credentials often carry stray whitespace; use the same
        # stripped values that were just validated.
        api_key = api_key.strip()
        repo_name = repo_name.strip()

    # Device setup
    cuda_available = torch.cuda.is_available()
    device = "cuda" if cuda_available else "cpu"
    if cuda_available:
        device_msg = "CUDA is available! Training will be faster on GPU."
    else:
        device_msg = "CUDA not available. Training on CPU will be slow."

    try:
        # "wikitext-2-raw-v1" is a configuration of the WikiText dataset, not
        # a dataset id, so it has to be passed separately from the dataset id.
        dataset_name = "Salesforce/wikitext"
        dataset_config = "wikitext-2-raw-v1"

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        train_dataset, error_msg = load_and_preprocess_data(
            dataset_name, tokenizer, config_name=dataset_config, split="train"
        )
        if error_msg:
            return f"❌ {error_msg}"
        if train_dataset is None:
            return "❌ Failed to load and preprocess dataset."

        # trainer.evaluate() needs an evaluation set; without one it raises
        # after the whole training run has already finished.
        eval_dataset, error_msg = load_and_preprocess_data(
            dataset_name, tokenizer, config_name=dataset_config, split="validation"
        )
        if error_msg:
            return f"❌ {error_msg}"
        if eval_dataset is None:
            return "❌ Failed to load and preprocess dataset."

        # Load model
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        model.resize_token_embeddings(len(tokenizer))

        # Training args
        output_dir = "./results"
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=1,
            per_device_train_batch_size=4,
            save_steps=500,
            save_total_limit=1,
            logging_steps=250,
            learning_rate=5e-5,
            weight_decay=0.01,
            push_to_hub=push_to_hub,
            hub_model_id=repo_name if push_to_hub else None,
            hub_token=api_key if push_to_hub else None,
            fp16=cuda_available,
        )

        # Data collator (mlm=False selects causal language modeling)
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        # Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        # Train
        trainer.train()

        # Save locally. The tokenizer is written next to the weights so the
        # saved (and, if requested, pushed) directory is loadable on its own.
        tokenizer.save_pretrained(output_dir)
        trainer.save_model(output_dir)

        # Evaluate
        eval_results = trainer.evaluate()
        eval_loss = eval_results.get("eval_loss", "N/A")

        # Push to hub if selected
        if push_to_hub:
            trainer.push_to_hub()
            hub_msg = f"✅ Model pushed to Hugging Face Hub: {repo_name}"
        else:
            hub_msg = "ℹ️ Model saved locally at ./results (not pushed to hub)."

        return (
            "✅ Training Complete!\n"
            f"- Device: {device_msg}\n"
            f"- Eval Loss: {eval_loss}\n"
            f"- {hub_msg}\n"
        )

    except Exception as e:
        # Top-level boundary for the UI: surface the failure as text.
        return f"❌ Training Error: {e}"
115
+
116
+ # ----------------------------- Gradio Interface ----------------------------- #
117
+
118
# Declarative UI: a model-size selector, optional Hub credentials, and a
# button that runs train_model and shows its status string.
with gr.Blocks(title="LLM Builder - Gradio") as demo:
    gr.Markdown("# 🤖 LLM Builder")
    gr.Markdown("### 1. Select Model Architecture")

    architecture_size = gr.Dropdown(
        choices=["Small", "Medium", "Large"],
        value="Small",
        label="Choose Model Size",
        info="Select the size of the model. Larger models have more parameters.",
    )

    gr.Markdown("### 2. Training Setup")

    with gr.Row():
        with gr.Column():
            api_key = gr.Textbox(
                label="Hugging Face Hub API Key",
                type="password",  # keep the token masked in the browser
                placeholder="hf_...",
                info="Required only if pushing to hub.",
            )
            repo_name = gr.Textbox(
                label="Repository Name",
                placeholder="your-username/your-model-name",
                info="Required only if pushing to hub.",
            )
            push_to_hub = gr.Checkbox(
                label="Push to Hugging Face Hub?",
                value=False,
            )

    train_btn = gr.Button("🚀 Start Training", variant="primary")
    output = gr.Textbox(
        label="Training Output",
        placeholder="Training logs and results will appear here...",
        lines=10,
    )

    # Input order must match train_model's positional parameters.
    train_btn.click(
        fn=train_model,
        inputs=[architecture_size, api_key, repo_name, push_to_hub],
        outputs=output,
    )

# Launch the app
if __name__ == "__main__":
    # Training runs for minutes. Enabling the request queue explicitly keeps
    # long-running calls from being cut off on Gradio releases where it is
    # not on by default; on releases where it is, this call is a no-op.
    demo.queue()
    demo.launch()