hoodiexxx
/

Bert_Chinese_Text_Classification_Model

Text Classification

Model card Files Files and versions Community

Bert_Chinese_Text_Classification_Model / xlsx2txt.py

hoodiexxx's picture

Upload 7 files

2c1d053 verified 9 months ago

1.55 kB

	import pandas as pd
	from sklearn.model_selection import train_test_split

	# Read the annotated spreadsheet. A forward slash is used because the
	# original 'data\data_excel.xlsx' relied on the invalid escape sequence '\d'
	# and only pointed inside the data directory on Windows.
	df = pd.read_excel('data/data_excel.xlsx')
	# Keep only the columns at positions 2 and 4; the column named '人工标注'
	# must be one of them for the filter below to work.
	df = df.iloc[:, [2, 4]]
	print(df)
	# Keep only the rows whose '人工标注' value is one of the five known labels;
	# every other row is dropped.
	df = df[df['人工标注'].isin(['查件', '催件', '下单', '拒识', '非需求场景'])]
	print(df)
	# 90/10 train/test split; the fixed random_state makes it reproducible.
	train_df, test_df = train_test_split(
	    df, test_size=0.1, train_size=0.9, random_state=42)

	def _strip_cells(frame):
	    """Return *frame* with surrounding whitespace trimmed from every cell."""
	    return frame.apply(lambda col: col.str.strip())


	def _join_columns(frame):
	    """Return a Series of '<first column>:<second column>' strings."""
	    return frame.iloc[:, 0] + frame.iloc[:, 1].map(lambda cell: ':' + str(cell))


	# Trim leading/trailing whitespace in both splits, then show them
	train_df = _strip_cells(train_df)
	test_df = _strip_cells(test_df)
	print(train_df)
	print(test_df)
	# Collapse each row into one colon-separated string; from this point on
	# train_df and test_df are Series rather than DataFrames
	train_df = _join_columns(train_df)
	test_df = _join_columns(test_df)

	# Configure pandas text rendering: no limit on column width (so long cell
	# values are not cut off) and left-justified column headers
	pd.set_option('display.max_colwidth', None)
	pd.set_option('display.colheader_justify', 'left')

	# Write each split to its own UTF-8 text file, one row per line.
	# Forward slashes replace the original 'data\output.txt' style paths: their
	# backslashes formed invalid escape sequences and only pointed inside the
	# data directory on Windows.
	for series, path in ((train_df, 'data/output.txt'),
	                     (test_df, 'data/output2.txt')):
	    # to_string pads rows with spaces to align them; removing every space
	    # drops that padding (and also any space inside the text itself).
	    output = series.to_string(index=False, header=False).replace(' ', '')
	    with open(path, 'w', encoding='utf-8') as f:
	        f.write(output)
	        f.write('\n')