#!/usr/bin/env python3
"""
Inference script for samples autoencoder model
Generated automatically during training
"""

import torch
import pandas as pd
import numpy as np
import json
import argparse
import os

def load_model_and_config(model_dir):
    """Load the trained model and its configuration"""
    config_path = os.path.join(model_dir, 'model_config.json')
    with open(config_path, 'r') as f:
        config = json.load(f)
    
    # Locate the saved model weights
    model_file = config['model_info']['saved_model_file']
    model_path = os.path.join(model_dir, model_file)

    # Reconstruct the model architecture based on the saved model type.
    # The classes live in the training module, which must be importable.
    from compress_data_unified import SimpleAE, AE
    
    latent_dims = config['model_info']['latent_dims']
    input_dim = config['model_info']['input_dim'] 
    layer_sizes = config['model_info']['layer_sizes']
    model_type = config['model_info']['model_type']
    
    if model_type == 'SimpleAE':
        if isinstance(layer_sizes, list) and len(layer_sizes) > 1:
            # If wrapped in AE class
            model = AE(layer_sizes, use_simple=True)
        else:
            # Direct SimpleAE
            model = SimpleAE(input_dim, latent_dims)
    else:
        # Standard AE
        model = AE(layer_sizes, use_simple=False)
    
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()
    
    return model, config
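
# For reference, the model_config.json written during training is expected to
# contain at least the fields read above. A minimal sketch (only the keys are
# used by this script; the values shown are illustrative, not prescriptive):
#
#   {
#     "model_info": {
#       "saved_model_file": "autoencoder.pt",
#       "model_type": "SimpleAE",
#       "input_dim": 5000,
#       "latent_dims": 50,
#       "layer_sizes": [5000, 512, 50]
#     }
#   }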

def preprocess_data(data, config):
    """Apply the same preprocessing as training"""
    # Normalize to the [-1, 1] range exactly as done in training. Note that
    # min/max are recomputed from the input data here, so the mapping matches
    # training only if the new data spans the same value range.
    eps = 1e-8
    min_val = np.nanmin(data)
    max_val = np.nanmax(data)
    if max_val - min_val < eps:
        # Constant input: normalization is undefined, return unchanged
        return data
    normalized = 2 * (data - min_val) / (max_val - min_val + eps) - 1
    return normalized
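
# Note: run_inference below saves the reconstruction in the normalized [-1, 1]
# space. A minimal sketch of the inverse transform, assuming the caller keeps
# the min_val/max_val computed during preprocessing (this helper and both
# arguments are illustrative additions, not part of the generated pipeline):
def denormalize_data(normalized, min_val, max_val):
    """Invert the [-1, 1] normalization applied in preprocess_data."""
    eps = 1e-8
    return (normalized + 1) / 2 * (max_val - min_val + eps) + min_val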

def run_inference(model_dir, input_data_path, output_path=None):
    """Run inference on new data"""
    model, config = load_model_and_config(model_dir)
    
    # Load and preprocess data
    data = pd.read_csv(input_data_path, index_col=0)
    data_processed = preprocess_data(data, config)
    
    # Convert to tensor
    data_tensor = torch.FloatTensor(data_processed.values)
    
    # Run inference
    with torch.no_grad():
        # Encode to latent space
        latent = model.encode(data_tensor)
        # Decode back to original space
        reconstructed = model.decode(latent)
    
    # Convert back to DataFrames, preserving the sample index
    latent_df = pd.DataFrame(
        latent.numpy(),
        index=data.index,
        columns=[f'latent_{i+1}' for i in range(config['model_info']['latent_dims'])],
    )

    reconstructed_df = pd.DataFrame(
        reconstructed.numpy(),
        index=data.index,
        columns=data.columns,
    )
    
    # Save results
    if output_path is None:
        output_path = 'inference_results'
    
    os.makedirs(output_path, exist_ok=True)
    latent_df.to_csv(os.path.join(output_path, 'latent_representation.csv'))
    reconstructed_df.to_csv(os.path.join(output_path, 'reconstructed_data.csv'))
    
    print(f"Inference completed:")
    print(f"  Latent representation saved: {os.path.join(output_path, 'latent_representation.csv')}")
    print(f"  Reconstructed data saved: {os.path.join(output_path, 'reconstructed_data.csv')}")
    
    return latent_df, reconstructed_df

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run inference with a trained autoencoder')
    parser.add_argument('--model_dir', type=str, required=True,
                        help='Directory containing the trained model and config')
    parser.add_argument('--input_data', type=str, required=True,
                        help='Path to the input data CSV file')
    parser.add_argument('--output_dir', type=str, default='inference_results',
                        help='Output directory for results')
    
    args = parser.parse_args()
    
    latent, reconstructed = run_inference(args.model_dir, args.input_data, args.output_dir)
    print(f"Latent dimensions: {latent.shape}")
    print(f"Reconstructed dimensions: {reconstructed.shape}")