File size: 4,033 Bytes
23e9906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
import requests
import json
from pathlib import Path
import time

# --- Configuration ---
# Input CSV holding the emails that will be sent to the API.
DATASET_PATH: Path = Path("combined_emails_with_natural_pii.csv")
# Destination file for the responses, one JSON document per line (JSON Lines).
OUTPUT_PATH: Path = Path("api_output_results.jsonl")
# URL of the running classification endpoint.
API_ENDPOINT: str = "http://127.0.0.1:8000/classify/"

# Name of the CSV column that contains the email text; change it if your
# dataset uses a different header.
email_body_column: str = "email"
# category_column = 'type' # Original category column (optional, for reference)

# --- Main Function ---
def process_emails_via_api(data_path: Path, output_path: Path, api_url: str):
    """Send every email in a CSV to the classification API and log the replies.

    The CSV at ``data_path`` is loaded with pandas (malformed lines are
    skipped), rows whose email column is missing are dropped, and each
    remaining email body is POSTed to ``api_url`` as ``{"email_body": ...}``
    with a 30 second timeout. Exactly one JSON line is written to
    ``output_path`` per email: the decoded API response on success, or an
    error record (``error``, ``input_email_body``, ``index`` and, for
    undecodable responses, ``response_text``) on failure.

    Args:
        data_path: Location of the input CSV. It must contain the column
            named by the module-level ``email_body_column``.
        output_path: File that receives the JSON Lines output. It is opened
            in write mode, so any existing content is discarded.
        api_url: URL of the classification endpoint.

    Returns:
        None. Problems with the input (missing file, unreadable CSV, missing
        column, no usable rows) are reported on stdout and end the function
        early, before the output file is opened.
    """
    if not data_path.exists():
        print(f"Error: Input dataset not found at {data_path}")
        return

    print(f"Loading dataset from {data_path}...")
    try:
        # Skip bad lines just in case, consistent with training
        df = pd.read_csv(data_path, on_bad_lines='skip')
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    if email_body_column not in df.columns:
        print(f"Error: Email body column '{email_body_column}' not found.")
        return

    # Handle potential missing email bodies
    df = df.dropna(subset=[email_body_column])
    if df.empty:
        print("Error: No valid email bodies found after handling missing values.")
        return

    total_emails = len(df)
    print(f"Processing {total_emails} emails via API: {api_url}")

    # Only the email column is needed, so iterate over it directly instead of
    # materialising a full row object per email.
    email_bodies = df[email_body_column]

    # Open output file in write mode (clears existing content)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        # `position` counts processed emails from 1. `index` is the DataFrame
        # label, which has gaps once rows were dropped, so it is kept only to
        # identify the row in error records and must not drive the progress
        # counter.
        for position, (index, email_body) in enumerate(email_bodies.items(), start=1):
            email_text = str(email_body)  # Ensure it's a string
            payload = {"email_body": email_text}

            try:
                response = requests.post(api_url, json=payload, timeout=30)
                # Raise an exception for bad status codes (4xx or 5xx)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"\nError processing email index {index}: {e}")
                record = {
                    "error": str(e),
                    "input_email_body": email_text,
                    "index": index
                }
            else:
                # Decoding is handled separately from the transport errors
                # above: recent `requests` releases raise a JSON error that is
                # also a RequestException, which would otherwise hide the raw
                # response text. Every JSON decode error type involved is a
                # ValueError subclass.
                try:
                    record = response.json()
                except ValueError as e:
                    print(f"\nError decoding JSON response for email index {index}: {e}")
                    print(f"Response status code: {response.status_code}")
                    print(f"Response text: {response.text[:500]}...")  # Print beginning of text
                    record = {
                        "error": f"JSONDecodeError: {e}",
                        "response_text": response.text,
                        "input_email_body": email_text,
                        "index": index
                    }

            # Write the result (or the error record) as a JSON line
            f_out.write(json.dumps(record) + '\n')

            if position % 50 == 0:  # Print progress every 50 emails
                print(f"Processed {position}/{total_emails} emails...")

    print(f"\nProcessing complete. Results saved to {output_path}")


# --- Script Execution ---
if __name__ == "__main__":
    # The classification API has to be reachable before any email is sent,
    # so show the reminder and wait for the operator to confirm.
    for banner_line in (
        "--- Starting API Output Generation ---",
        "Ensure the FastAPI server (python app.py or uvicorn) is running in another terminal.",
    ):
        print(banner_line)
    input("Press Enter to continue once the API is running...")

    # Run the batch with the module-level configuration.
    process_emails_via_api(
        data_path=DATASET_PATH,
        output_path=OUTPUT_PATH,
        api_url=API_ENDPOINT,
    )
    print("--- Finished API Output Generation ---")