"""Split manually labelled rows from an Excel sheet into train/test text files.

The script reads ``data/data_excel.xlsx``, keeps the third and fifth columns,
retains only the rows whose ``人工标注`` value is one of ``KEPT_LABELS``, splits
them 90/10 with a fixed random seed, and writes each split as
``<first column>:<second column>`` lines to ``data/output.txt`` (train) and
``data/output2.txt`` (test).
"""
from pathlib import Path

import pandas as pd

# Paths are built with pathlib: the previous literals ('data\data_excel.xlsx',
# 'data\output.txt') relied on invalid escape sequences and only resolved to
# the intended files on Windows.
DATA_DIR = Path("data")
INPUT_PATH = DATA_DIR / "data_excel.xlsx"
TRAIN_OUTPUT_PATH = DATA_DIR / "output.txt"
TEST_OUTPUT_PATH = DATA_DIR / "output2.txt"

# Positional indices of the two columns kept from the sheet.
SELECTED_COLUMNS = [2, 4]
# Column holding the manual annotation; it must be one of SELECTED_COLUMNS.
LABEL_COLUMN = "人工标注"
# Rows whose annotation is not in this list are dropped.
KEPT_LABELS = ["查件", "催件", "下单", "拒识", "非需求场景"]


def strip_whitespace(frame: pd.DataFrame) -> pd.DataFrame:
    """Return *frame* with leading/trailing whitespace stripped in every column.

    Every column must support the ``.str`` accessor (i.e. hold strings).
    """
    return frame.apply(lambda column: column.str.strip())


def format_lines(frame: pd.DataFrame) -> str:
    """Render a two-column frame as newline-separated ``first:second`` lines.

    All ASCII spaces are removed from the result. This drops the padding that
    ``Series.to_string`` adds and, as before, any space inside the values.
    An empty frame yields an empty string.
    """
    if frame.empty:
        # to_string() would otherwise emit a "Series([], )" placeholder.
        return ""
    joined = frame.iloc[:, 0] + frame.iloc[:, 1].map(
        lambda value: ":" + str(value)
    )
    # Lift the column-width limit only while rendering so long values are not
    # truncated to "..." and the global display options stay untouched.
    with pd.option_context("display.max_colwidth", None):
        rendered = joined.to_string(index=False, header=False)
    return rendered.replace(" ", "")


def write_lines(path: Path, text: str) -> None:
    """Write *text* followed by a trailing newline to *path* as UTF-8."""
    path.write_text(f"{text}\n", encoding="utf-8")


def main() -> None:
    """Load the sheet, filter and split it, then write both output files."""
    # Imported here so the formatting helpers above can be imported (and
    # tested) without scikit-learn installed.
    from sklearn.model_selection import train_test_split

    df = pd.read_excel(INPUT_PATH)
    df = df.iloc[:, SELECTED_COLUMNS]
    print(df)

    # Keep only the rows carrying one of the known labels.
    df = df[df[LABEL_COLUMN].isin(KEPT_LABELS)]
    print(df)

    # Fixed seed keeps the 90/10 split reproducible between runs.
    train_df, test_df = train_test_split(
        df, test_size=0.1, train_size=0.9, random_state=42
    )

    train_df = strip_whitespace(train_df)
    test_df = strip_whitespace(test_df)
    print(train_df)
    print(test_df)

    write_lines(TRAIN_OUTPUT_PATH, format_lines(train_df))
    write_lines(TEST_OUTPUT_PATH, format_lines(test_df))


if __name__ == "__main__":
    main()