from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
# Directory holding both the input workbook and the generated text files.
DATA_DIR = Path('data')

# Values of the '人工标注' (manual annotation) column that are exported;
# rows carrying any other value are discarded.
VALID_LABELS = ('查件', '催件', '下单', '拒识', '非需求场景')

# Single-pass cleanup applied to every exported record: U+0020 spaces are
# removed and tab / CR / LF are written as visible escapes so that a record
# can never span more than one physical line of the output file.
# NOTE(review): removing *all* spaces also removes spaces inside the text
# itself; presumably downstream consumers expect that -- confirm.
LINE_CLEANUP = str.maketrans({
    ' ': None,
    '\t': '\\t',
    '\r': '\\r',
    '\n': '\\n',
})


def clean_columns(df):
    """Return a copy of *df* without incomplete rows and with stripped cells.

    Rows containing a missing value in any column are dropped, then every
    remaining cell is converted to ``str`` and has its leading and trailing
    whitespace removed. The input frame is not modified.
    """
    cleaned = df.dropna().copy()
    for name in cleaned.columns:
        # Cast first: ``.str.strip()`` alone turns non-string cells
        # (e.g. numeric Excel cells) into NaN.
        cleaned[name] = cleaned[name].astype(str).str.strip()
    return cleaned


def format_lines(df):
    """Return one ``'<first column>:<second column>'`` string per row of *df*.

    Columns are addressed by position, so *df* must have at least two
    columns. Each record is passed through ``LINE_CLEANUP``.
    """
    return [
        f'{left}:{right}'.translate(LINE_CLEANUP)
        for left, right in zip(df.iloc[:, 0], df.iloc[:, 1])
    ]


def write_lines(path, lines):
    """Write *lines* to *path* as UTF-8 text, each followed by a newline."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line in lines)


def main():
    """Export a 90/10 train/test split of the annotated workbook.

    Reads ``data/data_excel.xlsx``, keeps the third and fifth columns,
    cleans them, keeps only rows whose '人工标注' value is in
    ``VALID_LABELS``, splits the rows with a fixed random seed and writes
    the training part to ``data/output.txt`` and the test part to
    ``data/output2.txt``. Intermediate frames are printed to stdout.
    """
    df = pd.read_excel(DATA_DIR / 'data_excel.xlsx')

    # Only the third and fifth columns are used.
    # NOTE(review): presumably the text and its manual annotation -- confirm
    # against the workbook layout.
    df = df.iloc[:, [2, 4]]
    print(df)

    # Clean before filtering so that labels with stray surrounding
    # whitespace still match VALID_LABELS instead of being dropped.
    df = clean_columns(df)
    df = df[df['人工标注'].isin(VALID_LABELS)]
    print(df)

    train_df, test_df = train_test_split(df, test_size=0.1, train_size=0.9,
                                         random_state=42)
    print(train_df)
    print(test_df)

    # Records are built explicitly rather than through DataFrame display
    # formatting, so the files do not depend on pandas display options.
    write_lines(DATA_DIR / 'output.txt', format_lines(train_df))
    write_lines(DATA_DIR / 'output2.txt', format_lines(test_df))


if __name__ == '__main__':
    main()