| from dataclasses import dataclass |
| from typing import Optional |
|
|
| import pandas as pd |
| from datasets import Dataset |
| from sklearn.model_selection import train_test_split |
|
|
|
|
# Column names the preprocessors in this file create on their output frames.
# Incoming train/validation data is rejected if it already contains one.
RESERVED_COLUMNS = [
    "autotrain_text",
    "autotrain_label",
]

# Extra output column names created by the LLM preprocessor; checked together
# with RESERVED_COLUMNS when validating LLM input data.
LLM_RESERVED_COLUMNS = [
    "autotrain_prompt",
    "autotrain_context",
    "autotrain_response",
    "autotrain_prompt_start",
]
|
|
|
|
@dataclass
class TextBinaryClassificationPreprocessor:
    """Validate, split and upload a text/label table as a Hub dataset.

    Attributes:
        train_data: Training rows; must contain ``text_column`` and
            ``label_column``.
        text_column: Name of the column holding the input text.
        label_column: Name of the column holding the target label.
        username: Hub namespace; first part of the dataset repo id.
        project_name: Used to build the repo id
            ``{username}/autotrain-data-{project_name}``.
        token: Token forwarded to ``Dataset.push_to_hub``.
        valid_data: Optional validation rows. When given, no random split is
            performed.
        test_size: Fraction (or count) passed to ``train_test_split`` when
            ``valid_data`` is absent.
        seed: ``random_state`` passed to ``train_test_split``.
    """

    train_data: pd.DataFrame
    text_column: str
    label_column: str
    username: str
    project_name: str
    token: str
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42

    def __post_init__(self):
        """Check that required columns exist and reserved names are unused.

        Raises:
            ValueError: If ``text_column`` or ``label_column`` is missing from
                the train (or supplied validation) data, or if either frame
                already contains a name listed in ``RESERVED_COLUMNS``.
        """
        frames = [("train", self.train_data)]
        if self.valid_data is not None:
            frames.append(("valid", self.valid_data))

        # Required columns: train data is checked before validation data.
        for split_name, frame in frames:
            for required in (self.text_column, self.label_column):
                if required not in frame.columns:
                    raise ValueError(f"{required} not in {split_name} data")

        # Reserved names are the output columns written by prepare_columns.
        for column in RESERVED_COLUMNS:
            for _, frame in frames:
                if column in frame.columns:
                    raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return ``(train_df, valid_df)``.

        When ``valid_data`` was supplied, both frames are returned as-is.
        Otherwise ``train_data`` is split with ``train_test_split``,
        stratified on ``label_column``, and both parts get a fresh index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def _to_reserved_columns(self, frame):
        """Return a new frame with text/label moved to the reserved names."""
        # ``assign`` builds a new DataFrame, so the caller's frame (which may
        # be the user's own ``train_data``/``valid_data``) is never modified.
        relabelled = frame.assign(
            autotrain_text=frame[self.text_column],
            autotrain_label=frame[self.label_column],
        )
        return relabelled.drop(columns=[self.text_column, self.label_column])

    def prepare_columns(self, train_df, valid_df):
        """Copy text/label into ``autotrain_text``/``autotrain_label``.

        The original ``text_column`` and ``label_column`` are dropped from the
        returned frames; any other columns are kept. The input frames are left
        unchanged.

        Returns:
            Tuple ``(train_df, valid_df)`` of new DataFrames.
        """
        return self._to_reserved_columns(train_df), self._to_reserved_columns(valid_df)

    def prepare(self):
        """Split, relabel, convert to ``Dataset`` and push both splits.

        Both splits are pushed to the private repo
        ``{username}/autotrain-data-{project_name}`` as ``train`` and
        ``validation``.

        Returns:
            Tuple ``(train, valid)`` of the uploaded ``Dataset`` objects.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)

        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_ds.push_to_hub(repo_id, split="train", private=True, token=self.token)
        valid_ds.push_to_hub(repo_id, split="validation", private=True, token=self.token)
        return train_ds, valid_ds
|
|
|
|
class TextMultiClassClassificationPreprocessor(TextBinaryClassificationPreprocessor):
    """Multi-class variant; inherits the binary preprocessor without changes."""
|
|
|
|
class TextSingleColumnRegressionPreprocessor(TextBinaryClassificationPreprocessor):
    """Regression variant: same pipeline, but the random split is not stratified."""

    def split(self):
        """Return ``(train_df, valid_df)``.

        Supplied ``valid_data`` is used as-is together with ``train_data``.
        Otherwise ``train_data`` is split with ``train_test_split`` (no
        ``stratify`` argument) and both parts are re-indexed from zero.
        """
        if self.valid_data is None:
            parts = train_test_split(
                self.train_data,
                test_size=self.test_size,
                random_state=self.seed,
            )
            train_part, valid_part = (part.reset_index(drop=True) for part in parts)
            return train_part, valid_part
        return self.train_data, self.valid_data
|
|
|
|
@dataclass
class LLMPreprocessor:
    """Validate, split and upload LLM training data as a Hub dataset.

    Exactly one of two input layouts must be chosen: a single
    ``text_column``, or a ``prompt_column`` plus ``response_column`` pair.

    Attributes:
        train_data: Training rows.
        username: Hub namespace; first part of the dataset repo id.
        project_name: Used to build the repo id
            ``{username}/autotrain-data-{project_name}``.
        token: Token forwarded to ``Dataset.push_to_hub``.
        valid_data: Optional validation rows. When given, no random split is
            performed.
        test_size: Passed to ``train_test_split`` when ``valid_data`` is absent.
        seed: ``random_state`` passed to ``train_test_split``.
        context_column: Optional column moved to ``autotrain_context``
            (prompt/response layout only).
        prompt_start_column: Optional column moved to
            ``autotrain_prompt_start`` (prompt/response layout only).
        text_column: Column moved to ``autotrain_text`` (text layout).
        prompt_column: Column moved to ``autotrain_prompt``.
        response_column: Column moved to ``autotrain_response``.
    """

    train_data: pd.DataFrame
    username: str
    project_name: str
    token: str
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    context_column: Optional[str] = None
    prompt_start_column: Optional[str] = None
    text_column: Optional[str] = None
    prompt_column: Optional[str] = None
    response_column: Optional[str] = None

    def __post_init__(self):
        """Resolve the input layout and validate the data frames.

        Raises:
            ValueError: If both layouts (or neither complete layout) are
                given, if a required column is missing from the train or
                supplied validation data, or if either frame already contains
                a reserved output column name.
        """
        layout_error = "Please provide either text_column or prompt_column and response_column"

        if self.text_column is not None and (self.prompt_column is not None or self.response_column is not None):
            raise ValueError(layout_error)

        if self.text_column is not None:
            # In text layout the prompt/response names alias the text column so
            # the presence checks below cover it.
            self.prompt_column = self.text_column
            self.response_column = self.text_column

        # Without this guard a missing layout surfaced as the misleading
        # "None not in train data".
        if self.prompt_column is None or self.response_column is None:
            raise ValueError(layout_error)

        frames = [("train", self.train_data)]
        if self.valid_data is not None:
            frames.append(("valid", self.valid_data))

        # Required columns: train data is checked before validation data.
        for split_name, frame in frames:
            for required in (self.prompt_column, self.response_column):
                if required not in frame.columns:
                    raise ValueError(f"{required} not in {split_name} data")

        # Reserved names are the output columns written by prepare_columns.
        for column in RESERVED_COLUMNS + LLM_RESERVED_COLUMNS:
            for _, frame in frames:
                if column in frame.columns:
                    raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return ``(train_df, valid_df)``.

        Supplied ``valid_data`` is used as-is together with ``train_data``.
        Otherwise ``train_data`` is split with ``train_test_split`` (not
        stratified) and both parts get a fresh index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def _to_reserved_columns(self, frame):
        """Return a new frame with the source columns moved to reserved names."""
        if self.text_column is not None:
            # Text layout: only the text column is relabelled; context and
            # prompt-start columns are not processed in this mode.
            steps = [{"autotrain_text": self.text_column}]
        else:
            steps = [
                {
                    "autotrain_prompt": self.prompt_column,
                    "autotrain_response": self.response_column,
                }
            ]
            if self.context_column is not None:
                steps.append({"autotrain_context": self.context_column})
            if self.prompt_start_column is not None:
                steps.append({"autotrain_prompt_start": self.prompt_start_column})

        # Each step copies its source column(s) to the reserved name(s) and
        # then drops the source(s). ``assign`` and ``drop`` both return new
        # frames, so the caller's DataFrame is never modified.
        for step in steps:
            frame = frame.assign(**{target: frame[source] for target, source in step.items()})
            frame = frame.drop(columns=list(step.values()))
        return frame

    def prepare_columns(self, train_df, valid_df):
        """Move the configured input columns to their ``autotrain_*`` names.

        The source columns are dropped from the returned frames; any other
        columns are kept. The input frames are left unchanged.

        Returns:
            Tuple ``(train_df, valid_df)`` of new DataFrames.
        """
        return self._to_reserved_columns(train_df), self._to_reserved_columns(valid_df)

    def prepare(self):
        """Split, relabel, convert to ``Dataset`` and push both splits.

        Both splits are pushed to the private repo
        ``{username}/autotrain-data-{project_name}`` as ``train`` and
        ``validation``.

        Returns:
            Tuple ``(train, valid)`` of the uploaded ``Dataset`` objects.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)

        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_ds.push_to_hub(repo_id, split="train", private=True, token=self.token)
        valid_ds.push_to_hub(repo_id, split="validation", private=True, token=self.token)
        return train_ds, valid_ds
|
|