Spaces:
Sleeping
Sleeping
File size: 1,784 Bytes
1853d90 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import pandas as pd
import numpy as np
import json
from llm_config import generate_llm_response
from llm_prompts import DETERMINE_DTYPE_PROMPT
# Upper bound on how many values are sampled from a column when building the
# type-detection prompt (the sample is further capped at the row count).
SAMPLE_SIZE = 200
def determine_column_type(df, column):
    """Ask the LLM to classify a column's data type from a sample of its values.

    Up to ``SAMPLE_SIZE`` values are drawn from ``df[column]`` with a fixed
    random seed and embedded in ``DETERMINE_DTYPE_PROMPT``. The reply is
    expected to be a JSON object with the keys ``column_type`` and
    ``invalid_indices``.

    Args:
        df: DataFrame that contains the column.
        column: Label of the column to inspect.

    Returns:
        A ``(column_type, invalid_labels)`` tuple, where ``invalid_labels`` is
        a list of ``df`` index labels for the sampled rows the model flagged.
        Falls back to ``('string', [])`` when the reply cannot be parsed.
    """
    sampled = df[column].sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    prompt = DETERMINE_DTYPE_PROMPT.format(sample_values=str(sampled.tolist()))
    response = generate_llm_response(prompt)

    try:
        result = json.loads(response)
        column_type = result['column_type']
        reported = result['invalid_indices']
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers a non-string reply and valid JSON that is not an
        # object (e.g. a bare list or string), which cannot be indexed by key.
        print(f"Error parsing LLM response for column {column}")
        return 'string', []

    if not isinstance(reported, list):
        reported = []

    # The model only ever sees the sampled values as a plain list, so any index
    # it reports can only be a position in that list, not a DataFrame label.
    # Translate positions back to index labels so callers can use them with
    # ``df.loc``; silently drop anything that is not a valid position.
    # NOTE(review): assumes the prompt asks for 0-based positions - confirm
    # against DETERMINE_DTYPE_PROMPT.
    labels = sampled.index
    invalid_labels = [
        labels[pos]
        for pos in reported
        if isinstance(pos, int)
        and not isinstance(pos, bool)
        and 0 <= pos < len(labels)
    ]
    return column_type, invalid_labels
def enforce_column_type(df, column, column_type, invalid_indices):
    """Coerce a column to the detected type and blank out flagged rows.

    The column is converted in place on ``df``. Values that cannot be
    converted become missing because every conversion uses
    ``errors='coerce'``. Any ``column_type`` other than ``'float'``,
    ``'integer'`` or ``'date'`` leaves the column's values untouched.

    Args:
        df: DataFrame to modify (mutated in place).
        column: Label of the column to convert.
        column_type: Detected type name: ``'float'``, ``'integer'``,
            ``'date'`` or anything else for "leave as is".
        invalid_indices: Index labels of rows whose value should be set to
            missing. Labels that are not present in ``df.index`` are ignored;
            ``None`` or an empty collection means nothing is flagged.

    Returns:
        The same ``df`` object, after modification.
    """
    if column_type == 'float':
        df[column] = pd.to_numeric(df[column], errors='coerce')
    elif column_type == 'integer':
        # Nullable Int64 lets coerced failures stay missing in an integer column.
        # NOTE(review): astype('Int64') is expected to raise if a coerced value
        # is non-integral (e.g. 1.5) - confirm whether that should be handled.
        df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')
    elif column_type == 'date':
        df[column] = pd.to_datetime(df[column], errors='coerce')

    # The labels come from an LLM reply, so they may reference rows that do not
    # exist. A boolean mask built with isin() selects only the labels that are
    # really in the index instead of failing on the unknown ones.
    if invalid_indices is not None and len(invalid_indices) > 0:
        flagged = df.index.isin(invalid_indices)
        if flagged.any():
            df.loc[flagged, column] = np.nan
    return df
def process_dataframe(df):
    """Detect and enforce a data type for every column of ``df``.

    Each column is classified with ``determine_column_type`` and then
    converted with ``enforce_column_type``. Progress, the detected type, the
    number of flagged values and the share of non-null values left after
    conversion are printed for every column.

    Args:
        df: DataFrame to process.

    Returns:
        The DataFrame returned by the last ``enforce_column_type`` call, or
        ``df`` itself when it has no columns.
    """
    print("Determining and enforcing column data types...")

    for column in df.columns:
        print(f"\nProcessing column: {column}")

        detection = determine_column_type(df, column)
        column_type, invalid_indices = detection
        print(f" Detected type: {column_type}")
        print(f" Number of invalid values: {len(invalid_indices)}")

        df = enforce_column_type(df, column, column_type, invalid_indices)

        # count() only counts non-null entries, so the ratio is the share of
        # values that are still present after type enforcement.
        non_null = df[column].count()
        row_total = len(df)
        valid_percentage = (non_null / row_total) * 100
        print(f" Percentage of valid values after type enforcement: {valid_percentage:.2f}%")

    return df
|