import pandas as pd
from sklearn.model_selection import train_test_split

# Load the spreadsheet. The path uses a forward slash: '\d' is not a valid
# string escape, and a backslash is only a path separator on Windows.
df = pd.read_excel('data/data_excel.xlsx')
# Row count of the full sheet, taken before column selection and filtering.
n_rows = df.shape[0]
# Keep only the third and fifth columns (selected by position, not by name).
df = df.iloc[:, [2, 4]]
print(df)
# Keep only the rows whose '人工标注' value is one of the five listed labels;
# rows holding any other value, or no value at all, are dropped.
# NOTE(review): assumes '人工标注' is one of the two columns kept above - confirm.
df = df[df['人工标注'].isin(['查件', '催件', '下单', '拒识', '非需求场景'])]
print(df)
# Randomly split the filtered rows 90/10 into a training and a test set; the
# fixed random_state makes the split reproducible from run to run.
train_df, test_df = train_test_split(
    df, train_size=0.9, test_size=0.1, random_state=42
)

# Trim leading and trailing whitespace from every cell. The .str accessor is
# applied to each column, so both columns are expected to hold text.
train_df, test_df = (
    part.apply(lambda col: col.str.strip()) for part in (train_df, test_df)
)
print(train_df)
print(test_df)

# Collapse each row into a single "<first column>:<second column>" string.
# From here on train_df and test_df are Series rather than DataFrames.
train_df, test_df = (
    part.iloc[:, 0] + part.iloc[:, 1].apply(lambda cell: ':' + str(cell))
    for part in (train_df, test_df)
)

# Lift the column-width limit so that to_string() renders long values in full
# instead of cutting them short, and left-justify column headers.
pd.options.display.max_colwidth = None
pd.options.display.colheader_justify = 'left'

# Write the training rows to output.txt and the test rows to output2.txt, one
# row per line. The paths use forward slashes: '\o' is not a valid string
# escape, and a backslash is only a path separator on Windows.
for out_path, rows in (('data/output.txt', train_df),
                       ('data/output2.txt', test_df)):
    with open(out_path, 'w', encoding='utf-8') as f:
        # to_string() pads values with spaces for alignment. Removing every
        # space drops that padding, and also any space inside the text itself.
        output = rows.to_string(index=False, header=False).replace(' ', '')
        f.write(output)
        f.write('\n')