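"""Initialization script for the fake news detection system.

Checks whether a trained model already exists under /tmp. On a first-time
installation it creates the directory layout, copies the original datasets
(or falls back to a small built-in dataset), trains an initial
TF-IDF + logistic regression pipeline, saves the model artifacts and
metadata, creates the initial log files, and validates the installation.
On an existing installation it skips training and only revalidates.
"""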
import os
import sys
import shutil
import re
import pandas as pd
import json
from pathlib import Path
from datetime import datetime
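# Note: scikit-learn and joblib are imported lazily inside the functions that
# use them, presumably so the lighter setup steps can run without them.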

def log_step(message):
    """Log initialization steps"""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


def preprocess_text_function(texts):
    """Text preprocessing used by the training pipeline (same as in train.py).

    Defined at module level (rather than inside run_initial_training) so the
    fitted pipeline, which wraps it in a FunctionTransformer, can be pickled
    with joblib.
    """
    def clean_single_text(text):
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)
        text = re.sub(r'[.]{3,}', '...', text)
        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()
    processed = []
    for text in texts:
        processed.append(clean_single_text(text))
    return processed

def check_model_exists():
    """Check if trained model already exists"""
    model_files = [
        Path("/tmp/pipeline.pkl"),
        Path("/tmp/model.pkl"),
        Path("/tmp/vectorizer.pkl"),
        Path("/tmp/metadata.json")
    ]
    existing_files = [f for f in model_files if f.exists()]
    if len(existing_files) >= 2:  # At least pipeline + metadata OR model + vectorizer
        log_step(f"Found {len(existing_files)} existing model files")
        return True, existing_files
    else:
        log_step(f"Missing model files - only found {len(existing_files)}")
        return False, existing_files

def check_training_data_exists():
    """Check if training data is available"""
    data_files = [
        Path("/tmp/data/combined_dataset.csv"),
        Path("/app/data/combined_dataset.csv"),
        Path("/tmp/data/kaggle/Fake.csv"),
        Path("/tmp/data/kaggle/True.csv")
    ]
    existing_data = [f for f in data_files if f.exists()]
    if existing_data:
        log_step(f"Found training data: {[str(f) for f in existing_data]}")
        return True, existing_data
    else:
        log_step("No training data found")
        return False, []

def create_directories():
    """Create necessary directories"""
    log_step("Creating directory structure...")
    directories = [
        "/tmp/data",
        "/tmp/data/kaggle",
        "/tmp/model",
        "/tmp/logs",
        "/tmp/results",
        "/tmp/backups"
    ]
    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        log_step(f"Created {dir_path}")

def copy_original_datasets():
    """Copy original datasets from /app to /tmp"""
    log_step("Copying original datasets...")
    source_files = [
        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
    ]
    copied_count = 0
    for source, dest in source_files:
        if Path(source).exists():
            Path(dest).parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(source, dest)
            log_step(f"Copied {source} to {dest}")
            copied_count += 1
        else:
            log_step(f"Warning: source file not found: {source}")
    return copied_count > 0

def create_minimal_dataset():
    """Create a minimal dataset if original doesn't exist"""
    log_step("Creating minimal dataset...")
    combined_path = Path("/tmp/data/combined_dataset.csv")
    if combined_path.exists():
        log_step("Combined dataset already exists")
        return True
    # Create minimal training data with more samples for better training
    minimal_data = pd.DataFrame({
        'text': [
            # Real news samples
            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
            'Local authorities report significant improvements in air quality following new environmental regulations',
            'Research published in Nature journal shows promising results for renewable energy storage technology',
            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
            'Transportation department announces infrastructure improvements to major highways across the region',
            'Educational institutions implement new digital learning platforms to enhance student engagement',
            'Agricultural studies reveal improved crop yields through sustainable farming practices',
            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
            'Municipal government approves budget for public transportation expansion project in urban areas',
            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
            'International trade agreements show positive impact on local businesses and job creation',
            'Environmental protection agency releases report on water quality improvements in major rivers',
            # Fake news samples
            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
        ],
        'label': [
            # Real news labels (0)
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            # Fake news labels (1)
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
    })
    minimal_data.to_csv(combined_path, index=False)
    log_step(f"Created enhanced minimal dataset with {len(minimal_data)} samples")
    log_step(f" - Real news samples: {sum(minimal_data['label'] == 0)}")
    log_step(f" - Fake news samples: {sum(minimal_data['label'] == 1)}")
    return True

def run_initial_training():
    """Run comprehensive model training for first-time setup"""
    log_step("Starting comprehensive model training for first-time setup...")
    try:
        # Import training modules
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest, chi2
        from sklearn.preprocessing import FunctionTransformer
        from sklearn.metrics import accuracy_score, f1_score, classification_report
        import joblib
        # Text preprocessing is handled by the module-level preprocess_text_function
        # (same as in train.py), so the fitted pipeline remains picklable.
        # Load dataset
        dataset_path = Path("/tmp/data/combined_dataset.csv")
        if not dataset_path.exists():
            log_step("No dataset available for training")
            return False
        df = pd.read_csv(dataset_path)
        log_step(f"Loaded dataset with {len(df)} samples")
        # Data validation and cleaning
        df = df.dropna(subset=['text', 'label'])
        df = df[df['text'].astype(str).str.len() > 10]
        log_step(f"After cleaning: {len(df)} samples")
        log_step(f"Class distribution: {df['label'].value_counts().to_dict()}")
        # Prepare data
        X = df['text'].values
        y = df['label'].values
        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        log_step(f"Data split: {len(X_train)} train, {len(X_test)} test")
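        # Pipeline overview: raw text -> text cleaning (FunctionTransformer) ->
        # TF-IDF features over unigrams/bigrams -> chi-squared feature selection ->
        # logistic regression with balanced class weights.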
        # Create comprehensive pipeline
        text_preprocessor = FunctionTransformer(
            func=preprocess_text_function,
            validate=False
        )
        vectorizer = TfidfVectorizer(
            max_features=5000,
            min_df=1,
            max_df=0.95,
            ngram_range=(1, 2),
            stop_words='english',
            sublinear_tf=True,
            norm='l2'
        )
        feature_selector = SelectKBest(
            score_func=chi2,
            k=2000
        )
        # Create pipeline with Logistic Regression
        pipeline = Pipeline([
            ('preprocess', text_preprocessor),
            ('vectorize', vectorizer),
            ('feature_select', feature_selector),
            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
        ])
        log_step("Training model with optimized pipeline...")
        # Hyperparameter tuning for datasets with sufficient samples
        if len(X_train) >= 20:
            log_step("Performing hyperparameter tuning...")
            param_grid = {
                'model__C': [0.1, 1, 10],
                'model__penalty': ['l2']
            }
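            # Use 3 cross-validation folds when there are at least 30 training
            # samples, otherwise fall back to 2 to keep each fold reasonably sized.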
            cv_folds = max(2, min(3, len(X_train) // 10))
            grid_search = GridSearchCV(
                pipeline,
                param_grid,
                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
                scoring='f1_weighted',
                n_jobs=1
            )
            grid_search.fit(X_train, y_train)
            best_pipeline = grid_search.best_estimator_
            log_step(f"Best parameters: {grid_search.best_params_}")
            log_step(f"Best CV score: {grid_search.best_score_:.4f}")
        else:
            log_step("Using simple training for small dataset...")
            pipeline.fit(X_train, y_train)
            best_pipeline = pipeline
        # Evaluate model
        y_pred = best_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        log_step("Model Performance:")
        log_step(f" - Accuracy: {accuracy:.4f}")
        log_step(f" - F1 Score: {f1:.4f}")
        # Save model artifacts
        log_step("Saving model artifacts...")
        # Save the complete pipeline
        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
        log_step("Saved complete pipeline")
        # Save individual components for compatibility
        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
        log_step("Saved individual model components")
        # Generate comprehensive metadata
        metadata = {
            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            "model_type": "logistic_regression",
            "training_method": "initial_setup",
            "dataset_size": len(df),
            "train_size": len(X_train),
            "test_size": len(X_test),
            "test_accuracy": float(accuracy),
            "test_f1": float(f1),
            "hyperparameter_tuning": len(X_train) >= 20,
            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
            "class_distribution": df['label'].value_counts().to_dict(),
            "training_config": {
                "max_features": 5000,
                "ngram_range": [1, 2],
                "feature_selection_k": 2000,
                "test_size": 0.2
            },
            "timestamp": datetime.now().isoformat(),
            "initialization_notes": "Model trained during system initialization",
            "ready_for_production": True
        }
        # Save metadata
        with open("/tmp/metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)
        log_step("Saved comprehensive metadata")
        log_step("Initial model training completed successfully!")
        log_step(f"Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
        return True
    except Exception as e:
        log_step(f"Training failed: {str(e)}")
        import traceback
        log_step(f"Error details: {traceback.format_exc()}")
        return False

def create_initial_logs():
    """Create initial log files"""
    log_step("Creating initial log files...")
    try:
        # Activity log
        activity_log = [{
            "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
            "event": "System initialized successfully with trained model",
            "level": "INFO"
        }]
        with open("/tmp/activity_log.json", 'w') as f:
            json.dump(activity_log, f, indent=2)
        # Create empty monitoring logs
        log_dirs = ["/tmp/logs"]
        for log_dir in log_dirs:
            Path(log_dir).mkdir(parents=True, exist_ok=True)
        with open("/tmp/logs/monitoring_log.json", 'w') as f:
            json.dump([], f)
        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
            json.dump([], f)
        log_step("Initial log files created")
        return True
    except Exception as e:
        log_step(f"Log creation failed: {str(e)}")
        return False

def validate_installation():
    """Validate that the system is properly set up"""
    log_step("Validating system installation...")
    validation_checks = []
    # Check model files
    model_exists, model_files = check_model_exists()
    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
    # Check data files
    data_exists, data_files = check_training_data_exists()
    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
    # Check directories
    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
    dirs_exist = all(Path(d).exists() for d in required_dirs)
    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
    # Check logs
    log_exists = Path("/tmp/activity_log.json").exists()
    validation_checks.append(("Log Files", log_exists, "Activity log created"))
    # Test model loading
    model_loadable = False
    try:
        import joblib
        pipeline = joblib.load("/tmp/pipeline.pkl")
        test_prediction = pipeline.predict(["This is a test news article"])
        model_loadable = True
        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
    except Exception as e:
        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
    # Print validation results
    log_step("Validation Results:")
    all_passed = True
    for check_name, passed, details in validation_checks:
        status = "PASS" if passed else "FAIL"
        log_step(f" {status} {check_name}: {details}")
        if not passed:
            all_passed = False
    return all_passed, validation_checks

def main():
    """Main initialization function with smart training logic"""
    log_step("Starting intelligent system initialization...")
    # Check if model already exists
    model_exists, existing_model_files = check_model_exists()
    if model_exists:
        log_step("EXISTING INSTALLATION DETECTED")
        log_step("Found existing model files - skipping training")
        # Load existing metadata to show info
        try:
            with open("/tmp/metadata.json", 'r') as f:
                metadata = json.load(f)
            log_step("Existing Model Info:")
            log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
            log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
            log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
            log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")
        except Exception as e:
            log_step(f"Could not read existing metadata: {e}")
    else:
        log_step("FIRST-TIME INSTALLATION DETECTED")
        log_step("No existing model found - will train new model")
    # Run initialization steps
    steps = [
        ("Directory Creation", create_directories),
        ("Dataset Copy", copy_original_datasets),
        ("Dataset Preparation", create_minimal_dataset),
        ("Log Creation", create_initial_logs)
    ]
    # Add training step only if model doesn't exist
    if not model_exists:
        steps.insert(-1, ("Model Training", run_initial_training))
    failed_steps = []
    for step_name, step_function in steps:
        try:
            log_step(f"Starting: {step_name}")
            if step_function():
                log_step(f"{step_name} completed")
            else:
                log_step(f"{step_name} failed")
                failed_steps.append(step_name)
        except Exception as e:
            log_step(f"{step_name} failed: {str(e)}")
            failed_steps.append(step_name)
    # Final validation
    log_step("Running final system validation...")
    validation_passed, validation_results = validate_installation()
    # Summary
    log_step("=" * 60)
    if failed_steps:
        log_step(f"Initialization completed with {len(failed_steps)} issues")
        log_step(f"Failed steps: {', '.join(failed_steps)}")
    else:
        log_step("System initialization completed successfully!")
    if validation_passed:
        log_step("All validation checks passed!")
        log_step("System is ready for use!")
        if not model_exists:
            log_step("NEW MODEL TRAINED AND READY")
            log_step("You can now start making predictions!")
        else:
            log_step("EXISTING MODEL VALIDATED AND READY")
            log_step("System restored from previous installation!")
    else:
        log_step("Some validation checks failed")
        log_step("Manual intervention may be required")
    log_step("=" * 60)


if __name__ == "__main__":
    main()