Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
import json | |
from llm_config import generate_llm_response | |
from llm_prompts import DETERMINE_DTYPE_PROMPT | |
# Upper bound on how many values of a column are sampled and embedded in the
# LLM prompt by determine_column_type.
SAMPLE_SIZE = 200
def determine_column_type(df, column):
    """Ask the LLM to classify a column's type from a sample of its values.

    A reproducible random sample (at most ``SAMPLE_SIZE`` rows, fixed seed)
    of ``df[column]`` is rendered into ``DETERMINE_DTYPE_PROMPT`` and sent to
    ``generate_llm_response``. The reply is parsed as JSON and must contain
    the keys ``column_type`` and ``invalid_indices``.

    The model is shown only the sampled values as a plain list, so the
    indices it reports can only be positions within that list (assumption:
    the prompt asks for list positions -- confirm against
    ``DETERMINE_DTYPE_PROMPT``). They are translated back to the index labels
    of the sampled rows so the caller can use them with ``df.loc``.

    Args:
        df: DataFrame holding the column.
        column: Name of the column to inspect.

    Returns:
        A ``(column_type, invalid_indices)`` tuple, where ``invalid_indices``
        is a list of ``df`` index labels. Falls back to ``('string', [])``
        when the reply cannot be parsed or lacks the expected keys.
    """
    sample = df[column].sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    prompt = DETERMINE_DTYPE_PROMPT.format(sample_values=str(sample.tolist()))
    response = generate_llm_response(prompt)

    try:
        result = json.loads(response)
        column_type = result['column_type']
        positions = result['invalid_indices']
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers a non-string reply (e.g. None) and a JSON value
        # that is not an object (e.g. a bare list or number).
        print(f"Error parsing LLM response for column {column}")
        return 'string', []

    # A null / scalar "invalid_indices" is treated as "nothing flagged".
    if not isinstance(positions, list):
        positions = []

    sample_labels = sample.index
    sample_len = len(sample_labels)
    # Keep only real integer positions inside the sample; model output is
    # untrusted and may contain out-of-range or non-integer entries.
    invalid_indices = [
        sample_labels[pos]
        for pos in positions
        if isinstance(pos, int)
        and not isinstance(pos, bool)
        and 0 <= pos < sample_len
    ]
    return column_type, invalid_indices
def enforce_column_type(df, column, column_type, invalid_indices):
    """Coerce ``df[column]`` to the detected type and blank out flagged rows.

    The column is converted in place on ``df``:

    * ``'float'``   -> ``pd.to_numeric`` with unparseable values coerced to NaN
    * ``'integer'`` -> same, then cast to the nullable ``Int64`` dtype
    * ``'date'``    -> ``pd.to_datetime`` with unparseable values coerced
    * anything else -> left as is

    Afterwards every row whose index label appears in ``invalid_indices`` is
    set to a missing value. Labels that do not exist in ``df.index`` are
    ignored rather than raising ``KeyError``, because the list typically
    originates from model output.

    Args:
        df: DataFrame to modify (mutated in place).
        column: Name of the column to convert.
        column_type: Detected type name.
        invalid_indices: Index labels of rows to null out; may be empty or
            ``None``.

    Returns:
        The same ``df`` object, for call chaining.
    """
    if column_type == 'float':
        df[column] = pd.to_numeric(df[column], errors='coerce')
    elif column_type == 'integer':
        df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')
    elif column_type == 'date':
        df[column] = pd.to_datetime(df[column], errors='coerce')

    # Set invalid values to NaN. A boolean mask is used instead of passing
    # the labels straight to .loc so that unknown labels are skipped.
    if invalid_indices is not None and len(invalid_indices) > 0:
        flagged = df.index.isin(invalid_indices)
        if flagged.any():
            df.loc[flagged, column] = np.nan
    return df
def process_dataframe(df):
    """Detect and enforce a data type for every column of ``df``.

    For each column, in order, the type and the flagged rows are obtained
    from ``determine_column_type`` and applied with ``enforce_column_type``.
    Progress and the share of non-missing values after enforcement are
    printed to stdout.

    Args:
        df: DataFrame to process.

    Returns:
        The DataFrame returned by the last ``enforce_column_type`` call
        (or ``df`` itself when it has no columns).
    """
    print("Determining and enforcing column data types...")

    for column in df.columns:
        print(f"\nProcessing column: {column}")

        detection = determine_column_type(df, column)
        column_type, invalid_indices = detection
        print(f" Detected type: {column_type}")
        print(f" Number of invalid values: {len(invalid_indices)}")

        df = enforce_column_type(df, column, column_type, invalid_indices)

        # Series.count() excludes missing values, so this is the fraction
        # of rows that still hold a value after coercion.
        non_missing = df[column].count()
        total_rows = len(df)
        valid_percentage = (non_missing / total_rows) * 100
        print(f" Percentage of valid values after type enforcement: {valid_percentage:.2f}%")

    return df