# Source: ai-assistant/config/process_text.py
# Repository page header: uploaded by digitalai ("Upload 29 files"),
# commit 6736fcd (verified), file size 4.87 kB.
import pandas as pd
from pathlib import Path
class DataManager:
    """
    Manage the CSV-backed question database.

    Attributes:
        db_file (Path): Path of the main CSV database (``data/database.csv``).
        db_level (Path): Path of the level lookup table (``data/db_level``),
            read as CSV and expected to hold ``level`` and ``question`` columns.
    """

    # Column order of the main database; rows built by make_row follow it.
    _COLUMNS = ["level", "question_text", "options", "feedback", "next"]

    def __init__(self):
        """
        Initialize DataManager object with the default database files.
        """
        data_dir = Path("data")
        self.db_file = data_dir / "database.csv"
        self.db_level = data_dir / "db_level"

    def get_id(self, level):
        """
        Find the question corresponding to the given level value.

        The level table is re-read from ``self.db_level`` on every call.

        Args:
            level (str): The level value to search for.

        Returns:
            The ``question`` value of the first row whose ``level`` equals
            ``level``, or None when no row matches.
        """
        df_level = pd.read_csv(self.db_level)
        matches = df_level.loc[df_level["level"] == level, "question"]
        return None if matches.empty else matches.iloc[0]

    @staticmethod
    def cleaner(texts):
        """
        Split '|'-separated text and strip surrounding quotes and newlines.

        Only leading/trailing double quotes and newline characters are
        removed from each piece; spaces are kept.

        Args:
            texts (str): Text data separated by '|'.

        Returns:
            list of str: Cleaned text pieces, one per '|'-separated field.
        """
        return [piece.strip('"\n') for piece in texts.split("|")]

    @staticmethod
    def make_id(level_id, num, next_id):
        """
        Generate row IDs from a level ID and the IDs of the next questions.

        For the special level ``"root"`` each ID is ``L<next>``; for any other
        level it is ``L<level_id>/<next>``.

        Args:
            level_id (str): The level ID.
            num (int): The number of IDs to generate.
            next_id (list of str): The next IDs; the first ``num`` are used.

        Returns:
            list of str: Generated IDs.

        Raises:
            IndexError: If ``next_id`` holds fewer than ``num`` entries.
        """
        prefix = "L" if level_id == "root" else f"L{level_id}/"
        return [f"{prefix}{next_id[i]}" for i in range(num)]

    def make_row(
        self, level, question_text, options, feedback, next_id
    ):
        """
        Generate one database row per option.

        ``options``, ``feedback`` and ``next_id`` are '|'-separated strings
        that are split with :meth:`cleaner` and matched up position by
        position.

        Args:
            level (str): ID of the question.
            question_text (str): Text of the question.
            options (str): Text of the options.
            feedback (str): Text of the actions.
            next_id (str): IDs of the next questions.

        Returns:
            list of list: Rows of the form
            ``[row_id, question_text, option, feedback, next_id]``.

        Raises:
            IndexError: If ``feedback`` or ``next_id`` has fewer entries than
                ``options``. Extra entries are ignored.
        """
        next_list = self.cleaner(next_id)
        action_list = self.cleaner(feedback)
        option_list = self.cleaner(options)

        count = len(option_list)
        # Fail with a readable message instead of a bare "list index out of
        # range"; the exception type is the one the index lookups would raise.
        if len(action_list) < count or len(next_list) < count:
            raise IndexError(
                f"expected at least {count} feedback and next-id entries "
                f"(one per option), got {len(action_list)} feedback and "
                f"{len(next_list)} next-id entries"
            )

        list_id = self.make_id(level, count, next_list)
        # zip stops at the shortest input, i.e. at one row per option.
        return [
            [row_id, question_text, option, action, following]
            for row_id, option, action, following in zip(
                list_id, option_list, action_list, next_list
            )
        ]

    def _create_dataframe(
        self, level, question_text, options, feedback, next_id
    ):
        """
        Create a DataFrame from the rows generated for one question.

        Args:
            level (str): ID of the question.
            question_text (str): Text of the question.
            options (str): Texts of the options.
            feedback (str): Text of the actions.
            next_id (str): IDs of the next questions.

        Returns:
            pd.DataFrame: Frame with the columns ``level``, ``question_text``,
            ``options``, ``feedback`` and ``next``.
        """
        rows = self.make_row(level, question_text, options, feedback, next_id)
        return pd.DataFrame(columns=list(self._COLUMNS), data=rows)

    @staticmethod
    def _clean_dataframe(df_unclean):
        """
        Clean the DataFrame by removing duplicates and NaN values.

        Args:
            df_unclean (pd.DataFrame): The unclean DataFrame.

        Returns:
            pd.DataFrame: A frame without duplicated rows and without any row
            that contains a missing value.
        """
        return df_unclean.drop_duplicates().dropna()

    def save_to_database(
        self, level, question_text, options, feedback, next_id
    ):
        """
        Save the DataFrame to a CSV file after cleaning and combining with existing data.

        When the database file does not exist yet, it is created from the new
        rows alone. The parent directory is not created.

        Args:
            level (str): ID of the question.
            question_text (str): Text of the question.
            options (str): Text of the options.
            feedback (str): Text of the actions.
            next_id (str): IDs of the next questions.
        """
        df_input = self._create_dataframe(
            level, question_text, options, feedback, next_id
        )

        if Path(self.db_file).exists():
            # Read the stored rows as text: with dtype inference a stored
            # "2" comes back as the integer 2, never equals the incoming
            # string "2", and the duplicate would survive drop_duplicates.
            df_database = self.read_db(dtype=str)
            df_combined = pd.concat([df_database, df_input], ignore_index=True)
        else:
            # First save: there is nothing to merge with yet.
            df_combined = df_input

        self._clean_dataframe(df_combined).to_csv(self.db_file, index=False)

    def read_db(self, dtype=None):
        """
        Read data from a CSV file and return it as a DataFrame.

        Args:
            dtype: Optional dtype forwarded to ``pandas.read_csv``. The
                default (None) keeps pandas' automatic type inference.

        Returns:
            pd.DataFrame: The DataFrame read from the CSV file.
        """
        return pd.read_csv(self.db_file, dtype=dtype)