# AEGIS-SECURE-API / data_pipeline.py
# Akshat Bhatt
# added code
# e2e0c18
import os
import glob
import pandas as pd
import config
from feature_extraction import extract_features_from_dataframe
def load_and_sample_raw_data(data_dir, fraction=0.1, random_state=42):
    """Load every CSV in ``data_dir``, sample each one, and return the shuffled union.

    Each ``*.csv`` file directly inside ``data_dir`` is read with malformed
    lines skipped. A file that lacks a ``label`` or ``url`` column is skipped
    with a warning, and a file that raises while being read or sampled is
    reported and skipped, so one bad file does not abort the whole load.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.csv`` files.
        fraction: Fraction of rows drawn from each file; passed to
            ``DataFrame.sample(frac=...)``.
        random_state: Seed used for both the per-file sampling and the final
            shuffle.

    Returns:
        A DataFrame holding the sampled rows of all valid files, shuffled and
        re-indexed from 0. An empty DataFrame is returned when no CSV file is
        found or none of the files could be used.
    """
    # Sorted so the concatenation order (and therefore the seeded shuffle)
    # does not depend on the filesystem's arbitrary listing order.
    raw_data_files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
    if not raw_data_files:
        print(f"Error: No .csv files found in '{data_dir}'.")
        print("Please add your raw data files (e.g., phishing.csv, legit.csv) to the /data/ folder.")
        return pd.DataFrame()

    print(f"Found {len(raw_data_files)} raw data files.")

    all_samples = []
    for file_path in raw_data_files:
        # Deliberately broad: loading is best-effort per file, and any failure
        # is reported rather than allowed to abort the remaining files.
        try:
            print(f"Loading and sampling {os.path.basename(file_path)}...")
            df = pd.read_csv(file_path, on_bad_lines='skip')
            if 'label' not in df.columns or 'url' not in df.columns:
                print(f"Warning: Skipping {file_path}. Must contain 'label' and 'url' columns.")
                continue
            all_samples.append(df.sample(frac=fraction, random_state=random_state))
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    if not all_samples:
        print("Error: No valid data could be loaded.")
        return pd.DataFrame()

    combined_df = pd.concat(all_samples, ignore_index=True)
    # frac=1 shuffles without dropping rows. The previous hard-coded frac=0.1
    # here sub-sampled a second time, ignoring `fraction` and silently
    # shrinking the result to fraction * 0.1 of the raw data.
    combined_df = combined_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    print(f"Total raw training data prepared: {len(combined_df)} samples.")
    return combined_df
def main():
    """Run the pipeline end to end: sample raw data, extract features, save a CSV.

    The data directory, sampling fraction and output path are read from
    ``config``. When sampling yields an empty frame, a failure message is
    printed and nothing is written.
    """
    print("--- Starting Data Pipeline ---")

    sampled = load_and_sample_raw_data(
        config.DATA_DIR,
        fraction=config.TRAIN_SAMPLE_FRACTION,
    )
    if sampled.empty:
        print("Data pipeline failed. Exiting.")
        return

    features = extract_features_from_dataframe(sampled)
    output_path = config.ENGINEERED_TRAIN_FILE
    features.to_csv(output_path, index=False)

    print("\n--- Data Pipeline Complete ---")
    print(f"Engineered training set saved to: {output_path}")
    print(f"Total features: {len(config.ALL_FEATURE_COLUMNS)}")
if __name__ == "__main__":
    # Make sure the data directory exists before probing it for input files.
    os.makedirs(config.DATA_DIR, exist_ok=True)

    existing_csvs = glob.glob(os.path.join(config.DATA_DIR, "*.csv"))
    if not existing_csvs:
        # No inputs yet: write two tiny placeholder datasets (phishing first,
        # then legitimate) so the pipeline has files to read.
        print("Creating dummy data files...")
        placeholder_sets = (
            (
                'phishing_data_1.csv',
                [1, 1],
                ['facebook.com.login-support.ru', 'myetherwallets.kr/wallet'],
            ),
            (
                'legit_data_1.csv',
                [0, 0],
                ['google.com', 'https://www.millect.com/Plans'],
            ),
        )
        for file_name, labels, urls in placeholder_sets:
            frame = pd.DataFrame({'label': labels, 'url': urls})
            frame.to_csv(os.path.join(config.DATA_DIR, file_name), index=False)
        print(f"Dummy files created in {config.DATA_DIR}. Please replace them with your real data.")

    main()