amis5895 committed on
Commit
d2bd607
·
1 Parent(s): fc9016a

Replace with real AutoTrain training code

Browse files
Files changed (2) hide show
  1. app.py +135 -23
  2. app_real_training.py +309 -0
app.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- ๊ฐ„๋‹จํ•œ EXAONE Fine-tuning Space FastAPI ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜
4
  """
5
 
6
  import os
@@ -33,7 +33,8 @@ training_status = {
33
  "current_epoch": 0,
34
  "total_epochs": 3,
35
  "loss": 0.0,
36
- "status": "idle"
 
37
  }
38
 
39
  class TrainingRequest(BaseModel):
@@ -64,7 +65,7 @@ async def start_training(request: TrainingRequest, background_tasks: BackgroundT
64
  })
65
 
66
  # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ํ•™์Šต ์‹œ์ž‘
67
- background_tasks.add_task(run_training_simple, request)
68
 
69
  return {
70
  "message": "Training started",
@@ -72,17 +73,18 @@ async def start_training(request: TrainingRequest, background_tasks: BackgroundT
72
  "model_name": request.model_name
73
  }
74
 
75
- async def run_training_simple(request: TrainingRequest):
76
- """๊ฐ„๋‹จํ•œ ํ•™์Šต ์‹คํ–‰ (์‹œ๋ฎฌ๋ ˆ์ด์…˜)"""
77
  global training_status
78
 
79
  try:
80
- logger.info("Starting simple training process...")
81
  training_status["status"] = "running"
82
 
83
  # ๋ฐ์ดํ„ฐ ํŒŒ์ผ ํ™•์ธ
84
  train_file = Path("/app/train.csv")
85
  val_file = Path("/app/validation.csv")
 
86
 
87
  if not train_file.exists():
88
  logger.error(f"Training file not found: {train_file}")
@@ -102,24 +104,127 @@ async def run_training_simple(request: TrainingRequest):
102
  })
103
  return
104
 
105
- logger.info("Data files found, starting training simulation...")
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- # ๊ฐ„๋‹จํ•œ ํ›ˆ๋ จ ์‹œ๋ฎฌ๋ ˆ์ด์…˜
108
- for epoch in range(1, 4):
109
- training_status["current_epoch"] = epoch
110
- training_status["progress"] = (epoch / 3) * 100
111
- training_status["loss"] = 2.5 - (epoch * 0.5) # ์‹œ๋ฎฌ๋ ˆ์ด์…˜ ์†์‹ค๊ฐ’
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- logger.info(f"Epoch {epoch}/3 - Loss: {training_status['loss']:.3f}")
114
- await asyncio.sleep(5) # 5์ดˆ ๋Œ€๊ธฐ (์‹œ๋ฎฌ๋ ˆ์ด์…˜)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- training_status.update({
117
- "is_running": False,
118
- "progress": 100,
119
- "status": "completed"
120
- })
121
- logger.info("Training completed successfully!")
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  except Exception as e:
124
  logger.error(f"Training error: {str(e)}")
125
  training_status.update({
@@ -127,6 +232,10 @@ async def run_training_simple(request: TrainingRequest):
127
  "status": "error",
128
  "error": str(e)
129
  })
 
 
 
 
130
 
131
  @app.get("/status")
132
  async def get_status():
@@ -136,7 +245,7 @@ async def get_status():
136
  @app.get("/logs")
137
  async def get_logs():
138
  """๋กœ๊ทธ ์กฐํšŒ"""
139
- log_file = Path("/app/training.log")
140
  if log_file.exists():
141
  with open(log_file, "r", encoding="utf-8") as f:
142
  logs = f.read()
@@ -148,7 +257,7 @@ async def get_logs():
148
  async def stream_logs():
149
  """์‹ค์‹œ๊ฐ„ ๋กœ๊ทธ ์ŠคํŠธ๋ฆฌ๋ฐ"""
150
  def generate_logs():
151
- log_file = Path("/app/training.log")
152
  if log_file.exists():
153
  with open(log_file, "r", encoding="utf-8") as f:
154
  for line in f:
@@ -183,12 +292,15 @@ async def get_data_info():
183
  """๋ฐ์ดํ„ฐ ์ •๋ณด ์กฐํšŒ"""
184
  train_file = Path("/app/train.csv")
185
  val_file = Path("/app/validation.csv")
 
186
 
187
  info = {
188
  "train_file_exists": train_file.exists(),
189
  "validation_file_exists": val_file.exists(),
 
190
  "train_file_size": train_file.stat().st_size if train_file.exists() else 0,
191
- "validation_file_size": val_file.stat().st_size if val_file.exists() else 0
 
192
  }
193
 
194
  return info
 
1
  #!/usr/bin/env python3
2
  """
3
+ ์‹ค์ œ AutoTrain์„ ์‚ฌ์šฉํ•œ EXAONE Fine-tuning Space FastAPI ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜
4
  """
5
 
6
  import os
 
33
  "current_epoch": 0,
34
  "total_epochs": 3,
35
  "loss": 0.0,
36
+ "status": "idle",
37
+ "log_file": "/app/training.log"
38
  }
39
 
40
  class TrainingRequest(BaseModel):
 
65
  })
66
 
67
  # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ํ•™์Šต ์‹œ์ž‘
68
+ background_tasks.add_task(run_real_training, request)
69
 
70
  return {
71
  "message": "Training started",
 
73
  "model_name": request.model_name
74
  }
75
 
76
+ async def run_real_training(request: TrainingRequest):
77
+ """์‹ค์ œ AutoTrain์„ ์‚ฌ์šฉํ•œ ํ•™์Šต ์‹คํ–‰"""
78
  global training_status
79
 
80
  try:
81
+ logger.info("Starting real AutoTrain training process...")
82
  training_status["status"] = "running"
83
 
84
  # ๋ฐ์ดํ„ฐ ํŒŒ์ผ ํ™•์ธ
85
  train_file = Path("/app/train.csv")
86
  val_file = Path("/app/validation.csv")
87
+ config_file = Path("/app/autotrain_ultra_low_final.yaml")
88
 
89
  if not train_file.exists():
90
  logger.error(f"Training file not found: {train_file}")
 
104
  })
105
  return
106
 
107
+ if not config_file.exists():
108
+ logger.error(f"Config file not found: {config_file}")
109
+ training_status.update({
110
+ "is_running": False,
111
+ "status": "failed",
112
+ "error": "Config file not found"
113
+ })
114
+ return
115
+
116
+ logger.info("All files found, starting real AutoTrain training...")
117
+
118
+ # ๋กœ๊ทธ ํŒŒ์ผ ์ดˆ๊ธฐํ™”
119
+ log_file = Path(training_status["log_file"])
120
+ log_file.write_text("Starting AutoTrain training...\n", encoding="utf-8")
121
 
122
+ # AutoTrain ๋ช…๋ น์–ด ์‹คํ–‰
123
+ cmd = [
124
+ "autotrain", "llm",
125
+ "--train",
126
+ "--project_name", "exaone-finetuning",
127
+ "--model", "LGAI-EXAONE/EXAONE-4.0-1.2B",
128
+ "--data_path", "/app",
129
+ "--text_column", "text",
130
+ "--use_peft",
131
+ "--quantization", "int4",
132
+ "--lora_r", "16",
133
+ "--lora_alpha", "32",
134
+ "--lora_dropout", "0.05",
135
+ "--target_modules", "all-linear",
136
+ "--epochs", "3",
137
+ "--batch_size", "4",
138
+ "--gradient_accumulation", "4",
139
+ "--learning_rate", "2e-4",
140
+ "--warmup_ratio", "0.03",
141
+ "--mixed_precision", "fp16",
142
+ "--push_to_hub",
143
+ "--hub_model_id", request.model_name,
144
+ "--username", "amis5895"
145
+ ]
146
+
147
+ logger.info(f"Running command: {' '.join(cmd)}")
148
+
149
+ # ๋กœ๊ทธ ํŒŒ์ผ์— ๋ช…๋ น์–ด ๊ธฐ๋ก
150
+ with open(log_file, "a", encoding="utf-8") as f:
151
+ f.write(f"Command: {' '.join(cmd)}\n")
152
+ f.write("=" * 50 + "\n")
153
+
154
+ # AutoTrain ํ”„๋กœ์„ธ์Šค ์‹คํ–‰
155
+ process = subprocess.Popen(
156
+ cmd,
157
+ stdout=subprocess.PIPE,
158
+ stderr=subprocess.STDOUT,
159
+ text=True,
160
+ bufsize=1,
161
+ universal_newlines=True,
162
+ cwd="/app"
163
+ )
164
+
165
+ # ํ•™์Šต ์ง„ํ–‰ ์ƒํ™ฉ ๋ชจ๋‹ˆํ„ฐ๋ง
166
+ for line in process.stdout:
167
+ logger.info(line.strip())
168
 
169
+ # ๋กœ๊ทธ ํŒŒ์ผ์— ๊ธฐ๋ก
170
+ with open(log_file, "a", encoding="utf-8") as f:
171
+ f.write(line)
172
+
173
+ # ์ง„ํ–‰๋ฅ  ํŒŒ์‹ฑ
174
+ if "epoch" in line.lower() and "/" in line:
175
+ try:
176
+ # "Epoch 1/3" ํ˜•ํƒœ์—์„œ ์ง„ํ–‰๋ฅ  ์ถ”์ถœ
177
+ parts = line.split()
178
+ for i, part in enumerate(parts):
179
+ if part.lower() == "epoch" and i + 1 < len(parts):
180
+ epoch_info = parts[i + 1]
181
+ if "/" in epoch_info:
182
+ current, total = epoch_info.split("/")
183
+ training_status["current_epoch"] = int(current)
184
+ training_status["total_epochs"] = int(total)
185
+ training_status["progress"] = (int(current) / int(total)) * 100
186
+ break
187
+ except:
188
+ pass
189
+
190
+ # ์†์‹ค๊ฐ’ ํŒŒ์‹ฑ
191
+ if "loss" in line.lower():
192
+ try:
193
+ parts = line.split()
194
+ for i, part in enumerate(parts):
195
+ if part.lower() == "loss" and i + 1 < len(parts):
196
+ loss_value = float(parts[i + 1])
197
+ training_status["loss"] = loss_value
198
+ break
199
+ except:
200
+ pass
201
 
202
+ process.wait()
 
 
 
 
 
203
 
204
+ if process.returncode == 0:
205
+ training_status.update({
206
+ "is_running": False,
207
+ "progress": 100,
208
+ "status": "completed"
209
+ })
210
+ logger.info("Training completed successfully!")
211
+
212
+ # ์™„๋ฃŒ ๋กœ๊ทธ ๊ธฐ๋ก
213
+ with open(log_file, "a", encoding="utf-8") as f:
214
+ f.write("\n" + "=" * 50 + "\n")
215
+ f.write("Training completed successfully!\n")
216
+ else:
217
+ training_status.update({
218
+ "is_running": False,
219
+ "status": "failed"
220
+ })
221
+ logger.error("Training failed!")
222
+
223
+ # ์‹คํŒจ ๋กœ๊ทธ ๊ธฐ๋ก
224
+ with open(log_file, "a", encoding="utf-8") as f:
225
+ f.write("\n" + "=" * 50 + "\n")
226
+ f.write(f"Training failed with return code: {process.returncode}\n")
227
+
228
  except Exception as e:
229
  logger.error(f"Training error: {str(e)}")
230
  training_status.update({
 
232
  "status": "error",
233
  "error": str(e)
234
  })
235
+
236
+ # ์˜ค๋ฅ˜ ๋กœ๊ทธ ๊ธฐ๋ก
237
+ with open(log_file, "a", encoding="utf-8") as f:
238
+ f.write(f"\nError: {str(e)}\n")
239
 
240
  @app.get("/status")
241
  async def get_status():
 
245
  @app.get("/logs")
246
  async def get_logs():
247
  """๋กœ๊ทธ ์กฐํšŒ"""
248
+ log_file = Path(training_status["log_file"])
249
  if log_file.exists():
250
  with open(log_file, "r", encoding="utf-8") as f:
251
  logs = f.read()
 
257
  async def stream_logs():
258
  """์‹ค์‹œ๊ฐ„ ๋กœ๊ทธ ์ŠคํŠธ๋ฆฌ๋ฐ"""
259
  def generate_logs():
260
+ log_file = Path(training_status["log_file"])
261
  if log_file.exists():
262
  with open(log_file, "r", encoding="utf-8") as f:
263
  for line in f:
 
292
  """๋ฐ์ดํ„ฐ ์ •๋ณด ์กฐํšŒ"""
293
  train_file = Path("/app/train.csv")
294
  val_file = Path("/app/validation.csv")
295
+ config_file = Path("/app/autotrain_ultra_low_final.yaml")
296
 
297
  info = {
298
  "train_file_exists": train_file.exists(),
299
  "validation_file_exists": val_file.exists(),
300
+ "config_file_exists": config_file.exists(),
301
  "train_file_size": train_file.stat().st_size if train_file.exists() else 0,
302
+ "validation_file_size": val_file.stat().st_size if val_file.exists() else 0,
303
+ "config_file_size": config_file.stat().st_size if config_file.exists() else 0
304
  }
305
 
306
  return info
app_real_training.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ์‹ค์ œ AutoTrain์„ ์‚ฌ์šฉํ•œ EXAONE Fine-tuning Space FastAPI ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import subprocess
9
+ import asyncio
10
+ from pathlib import Path
11
+ from typing import Dict, Any
12
+ import logging
13
+
14
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
15
+ from fastapi.responses import StreamingResponse
16
+ from pydantic import BaseModel
17
+ import uvicorn
18
+
19
+ # ๋กœ๊น… ์„ค์ •
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
+ app = FastAPI(
24
+ title="EXAONE Fine-tuning",
25
+ description="EXAONE 4.0 1.2B ๋ชจ๋ธ ํŒŒ์ธํŠœ๋‹ API",
26
+ version="1.0.0"
27
+ )
28
+
29
+ # ์ „์—ญ ๋ณ€์ˆ˜
30
+ training_status = {
31
+ "is_running": False,
32
+ "progress": 0,
33
+ "current_epoch": 0,
34
+ "total_epochs": 3,
35
+ "loss": 0.0,
36
+ "status": "idle",
37
+ "log_file": "/app/training.log"
38
+ }
39
+
40
class TrainingRequest(BaseModel):
    """Request body for /start_training."""

    # Hugging Face Hub repository the fine-tuned model is pushed to.
    model_name: str = "amis5895/exaone-1p2b-nutrition-kdri"
42
+
43
+ @app.get("/")
44
+ async def root():
45
+ """๋ฃจํŠธ ์—”๋“œํฌ์ธํŠธ"""
46
+ return {
47
+ "message": "EXAONE Fine-tuning API",
48
+ "status": "running",
49
+ "version": "1.0.0"
50
+ }
51
+
52
+ @app.post("/start_training")
53
+ async def start_training(request: TrainingRequest, background_tasks: BackgroundTasks):
54
+ """ํ•™์Šต ์‹œ์ž‘"""
55
+ global training_status
56
+
57
+ if training_status["is_running"]:
58
+ raise HTTPException(status_code=400, detail="Training is already running")
59
+
60
+ training_status.update({
61
+ "is_running": True,
62
+ "progress": 0,
63
+ "current_epoch": 0,
64
+ "status": "starting"
65
+ })
66
+
67
+ # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ํ•™์Šต ์‹œ์ž‘
68
+ background_tasks.add_task(run_real_training, request)
69
+
70
+ return {
71
+ "message": "Training started",
72
+ "status": "starting",
73
+ "model_name": request.model_name
74
+ }
75
+
76
def _parse_progress_line(line: str) -> None:
    """Best-effort parse of 'Epoch c/t' and 'loss <value>' tokens into training_status.

    Parsing failures are deliberately non-fatal: progress reporting must
    never kill the training run.
    """
    lowered = line.lower()
    if "epoch" in lowered and "/" in line:
        try:
            parts = line.split()
            for i, part in enumerate(parts):
                if part.lower() == "epoch" and i + 1 < len(parts):
                    epoch_info = parts[i + 1]
                    if "/" in epoch_info:
                        current, total = epoch_info.split("/")
                        training_status["current_epoch"] = int(current)
                        training_status["total_epochs"] = int(total)
                        training_status["progress"] = (int(current) / int(total)) * 100
                        break
        except (ValueError, IndexError):
            pass  # token was not "c/t" integers; ignore
    if "loss" in lowered:
        try:
            parts = line.split()
            for i, part in enumerate(parts):
                if part.lower() == "loss" and i + 1 < len(parts):
                    training_status["loss"] = float(parts[i + 1])
                    break
        except (ValueError, IndexError):
            pass  # token after "loss" was not a float; ignore


def _run_and_monitor(cmd: list, log_file) -> int:
    """Run *cmd* synchronously, tee its output to *log_file*, and return its exit code.

    Holds the log file open for the whole run instead of reopening it per
    line, and feeds every line through the progress parser.
    """
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,  # line-buffered so progress shows up promptly
        cwd="/app",
    )
    with open(log_file, "a", encoding="utf-8") as f:
        for line in process.stdout:
            logger.info(line.strip())
            f.write(line)
            f.flush()  # keep /logs and /logs/stream fresh while training runs
            _parse_progress_line(line)
    process.wait()
    return process.returncode


async def run_real_training(request: TrainingRequest):
    """Run the real AutoTrain fine-tuning job in the background.

    Validates the required input files, launches the ``autotrain llm`` CLI,
    mirrors its output into the shared log file, and updates the
    module-level ``training_status`` dict as the run progresses.

    Args:
        request: Training request carrying the Hub model id to push to.
    """
    global training_status

    # Resolve the log path up front so the except-handler below can always
    # write to it (previously `log_file` was unbound when an error occurred
    # before its assignment, turning any early failure into a NameError).
    log_file = Path(training_status["log_file"])

    try:
        logger.info("Starting real AutoTrain training process...")
        training_status["status"] = "running"

        # Input files expected inside the Space container.
        required = (
            (Path("/app/train.csv"), "Training"),
            (Path("/app/validation.csv"), "Validation"),
            (Path("/app/autotrain_ultra_low_final.yaml"), "Config"),
        )
        for path, label in required:
            if not path.exists():
                logger.error(f"{label} file not found: {path}")
                training_status.update({
                    "is_running": False,
                    "status": "failed",
                    "error": f"{label} file not found",
                })
                return

        logger.info("All files found, starting real AutoTrain training...")

        # Fresh log file for this run.
        log_file.write_text("Starting AutoTrain training...\n", encoding="utf-8")

        # AutoTrain CLI invocation (QLoRA int4, pushes result to the Hub).
        cmd = [
            "autotrain", "llm",
            "--train",
            "--project_name", "exaone-finetuning",
            "--model", "LGAI-EXAONE/EXAONE-4.0-1.2B",
            "--data_path", "/app",
            "--text_column", "text",
            "--use_peft",
            "--quantization", "int4",
            "--lora_r", "16",
            "--lora_alpha", "32",
            "--lora_dropout", "0.05",
            "--target_modules", "all-linear",
            "--epochs", "3",
            "--batch_size", "4",
            "--gradient_accumulation", "4",
            "--learning_rate", "2e-4",
            "--warmup_ratio", "0.03",
            "--mixed_precision", "fp16",
            "--push_to_hub",
            "--hub_model_id", request.model_name,
            "--username", "amis5895",
        ]

        logger.info(f"Running command: {' '.join(cmd)}")
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(f"Command: {' '.join(cmd)}\n")
            f.write("=" * 50 + "\n")

        # The subprocess launch and line-by-line monitoring are blocking,
        # so run them in a worker thread to keep the event loop (and the
        # /status, /logs endpoints) responsive during training.
        returncode = await asyncio.to_thread(_run_and_monitor, cmd, log_file)

        if returncode == 0:
            training_status.update({
                "is_running": False,
                "progress": 100,
                "status": "completed",
            })
            logger.info("Training completed successfully!")
            with open(log_file, "a", encoding="utf-8") as f:
                f.write("\n" + "=" * 50 + "\n")
                f.write("Training completed successfully!\n")
        else:
            training_status.update({
                "is_running": False,
                "status": "failed",
            })
            logger.error("Training failed!")
            with open(log_file, "a", encoding="utf-8") as f:
                f.write("\n" + "=" * 50 + "\n")
                f.write(f"Training failed with return code: {returncode}\n")

    except Exception as e:
        logger.error(f"Training error: {str(e)}")
        training_status.update({
            "is_running": False,
            "status": "error",
            "error": str(e),
        })
        # Record the failure in the log file as well.
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(f"\nError: {str(e)}\n")
239
+
240
+ @app.get("/status")
241
+ async def get_status():
242
+ """ํ•™์Šต ์ƒํƒœ ์กฐํšŒ"""
243
+ return training_status
244
+
245
+ @app.get("/logs")
246
+ async def get_logs():
247
+ """๋กœ๊ทธ ์กฐํšŒ"""
248
+ log_file = Path(training_status["log_file"])
249
+ if log_file.exists():
250
+ with open(log_file, "r", encoding="utf-8") as f:
251
+ logs = f.read()
252
+ return {"logs": logs}
253
+ else:
254
+ return {"logs": "No logs available"}
255
+
256
+ @app.get("/logs/stream")
257
+ async def stream_logs():
258
+ """์‹ค์‹œ๊ฐ„ ๋กœ๊ทธ ์ŠคํŠธ๋ฆฌ๋ฐ"""
259
+ def generate_logs():
260
+ log_file = Path(training_status["log_file"])
261
+ if log_file.exists():
262
+ with open(log_file, "r", encoding="utf-8") as f:
263
+ for line in f:
264
+ yield f"data: {line}\\n\\n"
265
+ else:
266
+ yield "data: No logs available\\n\\n"
267
+
268
+ return StreamingResponse(generate_logs(), media_type="text/plain")
269
+
270
+ @app.post("/stop_training")
271
+ async def stop_training():
272
+ """ํ•™์Šต ์ค‘์ง€"""
273
+ global training_status
274
+
275
+ if not training_status["is_running"]:
276
+ raise HTTPException(status_code=400, detail="No training is running")
277
+
278
+ training_status.update({
279
+ "is_running": False,
280
+ "status": "stopped"
281
+ })
282
+
283
+ return {"message": "Training stopped"}
284
+
285
+ @app.get("/health")
286
+ async def health_check():
287
+ """ํ—ฌ์Šค ์ฒดํฌ"""
288
+ return {"status": "healthy", "timestamp": "2024-01-01T00:00:00Z"}
289
+
290
+ @app.get("/data_info")
291
+ async def get_data_info():
292
+ """๋ฐ์ดํ„ฐ ์ •๋ณด ์กฐํšŒ"""
293
+ train_file = Path("/app/train.csv")
294
+ val_file = Path("/app/validation.csv")
295
+ config_file = Path("/app/autotrain_ultra_low_final.yaml")
296
+
297
+ info = {
298
+ "train_file_exists": train_file.exists(),
299
+ "validation_file_exists": val_file.exists(),
300
+ "config_file_exists": config_file.exists(),
301
+ "train_file_size": train_file.stat().st_size if train_file.exists() else 0,
302
+ "validation_file_size": val_file.stat().st_size if val_file.exists() else 0,
303
+ "config_file_size": config_file.stat().st_size if config_file.exists() else 0
304
+ }
305
+
306
+ return info
307
+
308
+ if __name__ == "__main__":
309
+ uvicorn.run(app, host="0.0.0.0", port=7860)