import os
import logging

# Set a writable cache directory inside the container. These must be set
# before importing transformers/datasets, because both libraries resolve
# their cache paths at import time.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"

# Ensure cache directory exists
os.makedirs("/app/hf_cache", exist_ok=True)

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Set verbose logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Loading dataset...")
ds = load_dataset("facebook/natural_reasoning")  # Replace with your dataset
logger.info(f"Dataset loaded successfully! Dataset info:\n{ds}")

# Load tokenizer
logger.info("Loading tokenizer...")
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
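# Many causal-LM tokenizers define no pad token; fall back to EOS so that
# padding="max_length" in the preprocessing step below does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token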
logger.info("Tokenizer loaded successfully!")

# Tokenization function: formats each example as a "Q: ... A: ..." prompt.
# Labels are added later by the data collator, which copies input_ids.
def preprocess_function(examples):
    input_texts = [f"Q: {q} A: {a}" for q, a in zip(examples["question"], examples["reference_answer"])]
    return tokenizer(input_texts, truncation=True, padding="max_length", max_length=512)

# Tokenize dataset
logger.info("Tokenizing dataset...")
tokenized_datasets = ds.map(preprocess_function, batched=True)
logger.info("Dataset tokenized successfully!")

# Load model
logger.info("Loading model...")
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
logger.info("Model loaded successfully!")

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
    save_strategy="epoch",
    per_device_train_batch_size=4,  # adjust based on available GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,  # requires an authenticated HF token (huggingface-cli login or HF_TOKEN)
    report_to="none",
    logging_first_step=True
)

# Trainer, with a collator that copies input_ids into labels (mlm=False) so the
# causal LM returns a loss; without labels, Trainer errors out at the first step.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Start training
logger.info("Starting training...")
trainer.train()
logger.info("Training completed!")

# Push trained model to Hugging Face Hub
logger.info("Pushing trained model to Hugging Face Hub...")
trainer.push_to_hub()
logger.info("Model push completed! Training process finished successfully.")