hoodiexxx committed
Commit 2c1d053
1 Parent(s): e3fc0f2

Upload 7 files

Files changed (7)
  1. get_wordlists.py +97 -0
  2. model.bin +3 -0
  3. multihead_attention.py +255 -0
  4. sen2inds.py +151 -0
  5. textCNN_data.py +53 -0
  6. train2.py +118 -0
  7. xlsx2txt.py +41 -0
get_wordlists.py ADDED
@@ -0,0 +1,97 @@
+ # -*- coding: utf-8 -*-
+ '''
+ Tokenize the training data and drop every token that appears in the stop-word
+ list (the current code splits by character; the jieba call is commented out).
+ Build the word list, one entry per line: word  index  frequency
+ '''
+
+ import json
+ import jieba
+ from tqdm import tqdm
+
+ trainFile = 'data/output.txt'  # trainFile = 'data/train.txt'
+ devFile = 'data/output2.txt'  # "data/dev.txt"
+ stopwordFile = 'data/stopword.txt'
+ wordLabelFile = 'wordLabel.txt'
+ lengthFile = 'length.txt'
+
+
+ def read_stopword(file):
+     datas = open(file, 'r', encoding='utf_8').readlines()
+     datas = [data.replace('\n', '') for data in datas]
+     return datas
+
+
+ def main():
+     worddict = {}
+     stoplist = read_stopword(stopwordFile)
+     len_dic = {}
+     data_num = 0
+
+     # trainFile
+     datas = open(trainFile, 'r', encoding='utf_8').readlines()
+     data_num += len(datas)
+     datas = list(filter(None, datas))
+     for line in tqdm(datas, desc='traindata word to label'):
+         line = line.replace('\n', '').split(':')
+         # line = line.replace('\n', '').split('\t')
+         title_seg = [i for i in line[0]]  # character-level split
+         # title_seg = jieba.cut(line[0], cut_all=False)
+         length = 0
+         for w in title_seg:
+             if w in stoplist:
+                 continue
+             length += 1
+             if w in worddict:
+                 worddict[w] += 1
+             else:
+                 worddict[w] = 1
+         if length in len_dic:
+             len_dic[length] += 1
+         else:
+             len_dic[length] = 1
+
+     # devFile
+     datas = open(devFile, 'r', encoding='utf_8').readlines()
+     datas = list(filter(None, datas))
+     data_num += len(datas)
+     for line in tqdm(datas, desc='devdata word to label'):
+         line = line.replace('\n', '').split(':')
+         # line = line.replace('\n', '').split('\t')
+         title_seg = [i for i in line[0]]  # character-level split
+         # title_seg = jieba.cut(line[0], cut_all=False)
+         length = 0
+         for w in title_seg:
+             if w in stoplist:
+                 continue
+             length += 1
+             if w in worddict:
+                 worddict[w] += 1
+             else:
+                 worddict[w] = 1
+         if length in len_dic:
+             len_dic[length] += 1
+         else:
+             len_dic[length] = 1
+
+     # write the word list sorted by frequency; indices start at 1 (0 is reserved for padding)
+     wordlist = sorted(worddict.items(), key=lambda item: item[1], reverse=True)
+     f = open(wordLabelFile, 'w', encoding='utf_8')
+     # ind = 0
+     ind = 1
+     for t in wordlist:
+         d = t[0] + ' ' + str(ind) + ' ' + str(t[1]) + '\n'
+         ind += 1
+         f.write(d)
+
+     # write the sentence-length distribution
+     for k, v in len_dic.items():
+         len_dic[k] = round(v * 1.0 / data_num, 3)  # probability of each length, rounded to 3 decimals
+     len_list = sorted(len_dic.items(), key=lambda item: item[0], reverse=True)
+     f = open(lengthFile, 'w', encoding='utf_8')
+     for t in len_list:
+         d = str(t[0]) + ' ' + str(t[1]) + '\n'
+         f.write(d)
+
+
+ # lengthFile = 'length.txt'
+ # wordLabelFile = 'wordLabel.txt'
+ if __name__ == "__main__":
+     main()
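
For reference, a minimal sketch of reading the word list back (assuming the "word index frequency" line format produced above; the sample line in the comment is hypothetical):

# rebuild {word: index} from wordLabel.txt, the same mapping sen2inds.get_worddict() builds
word2ind = {}
with open('wordLabel.txt', encoding='utf_8') as f:
    for line in f:
        parts = line.split()          # e.g. "件 1 1083" -> word, index, frequency
        if len(parts) == 3:
            word2ind[parts[0]] = int(parts[1])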
model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a35cbd5939ffd2a3bdda50a86d758e45b6d819811ff693b1c8d8c6ada85fc34c
+ size 6843938
multihead_attention.py ADDED
@@ -0,0 +1,255 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from textCNN_data import textCNN_param
+
+
+ class Attention1(nn.Module):
+     """
+     1. input [N,T,C] -> Linear, Tanh
+     2. -> [N,T,1]    -> squeeze
+     3. -> [N,T]      -> Softmax
+     4. -> [N,T]      -> unsqueeze
+     5. -> [N,1,T]    -> repeat
+     6. -> [N,C,T]    -> transpose
+     7. -> [N,T,C]
+     """
+
+     def __init__(self, hidden_dim):
+         super(Attention1, self).__init__()
+         self.hidden_dim = hidden_dim
+         self.dense = nn.Linear(hidden_dim, 1)
+
+     def forward(self, features):
+         batch_size, time_step, hidden_dim = features.size()
+         weight = nn.Tanh()(self.dense(features)).squeeze(-1)
+
+         # mask padded positions with -inf so their softmax weight becomes 0
+         mask_idx = torch.sign(torch.abs(features).sum(dim=-1))
+         paddings = torch.ones_like(mask_idx) * (-2 ** 32 + 1)
+         weight = torch.where(torch.eq(mask_idx, 1), weight, paddings)
+
+         weight = nn.Softmax(dim=1)(weight)
+         weight = weight.unsqueeze(1)
+         weight = weight.repeat(1, hidden_dim, 1)
+         weight = weight.transpose(2, 1)
+         features_attention = weight * features
+
+         return features_attention
+
+
+ class Attention2(nn.Module):
+     """
+     1. input [N,T,C] -> Linear, Tanh
+     2. -> [N,T,C]    -> transpose
+     3. -> [N,C,T]    -> Softmax
+     4. -> [N,C,T]    -> mean
+     5. -> [N,T]      -> unsqueeze
+     6. -> [N,1,T]    -> expand
+     7. -> [N,C,T]    -> transpose
+     8. -> [N,T,C]
+     """
+
+     def __init__(self, hidden_dim):
+         super(Attention2, self).__init__()
+         self.hidden_dim = hidden_dim
+         self.dense = nn.Linear(hidden_dim, hidden_dim)
+
+     def forward(self, features, mean=True):
+         batch_size, time_step, hidden_dim = features.size()
+         weight = nn.Tanh()(self.dense(features))
+
+         # mask padded positions with -inf so their softmax weight becomes 0
+         mask_idx = torch.sign(torch.abs(features).sum(dim=-1))
+         mask_idx = mask_idx.unsqueeze(-1).expand(batch_size, time_step, hidden_dim)
+         paddings = torch.ones_like(mask_idx) * (-2 ** 32 + 1)
+         weight = torch.where(torch.eq(mask_idx, 1), weight, paddings)
+
+         weight = weight.transpose(2, 1)
+         weight = nn.Softmax(dim=2)(weight)
+         if mean:
+             weight = weight.mean(dim=1)
+             weight = weight.unsqueeze(1)
+             weight = weight.repeat(1, hidden_dim, 1)
+         weight = weight.transpose(2, 1)
+         features_attention = weight * features
+
+         return features_attention
+
+
+ class LayerNorm(nn.Module):
+     """
+     The result differs slightly from nn.LayerNorm.
+     """
+
+     def __init__(self, features, epsilon=1e-8):
+         super(LayerNorm, self).__init__()
+         self.beta = nn.Parameter(torch.zeros(features))
+         self.gamma = nn.Parameter(torch.ones(features))
+         self.epsilon = epsilon
+
+     def forward(self, x):
+         mean = x.mean(-1, keepdim=True)
+         std = x.std(-1, keepdim=True)
+         normalized = (x - mean) / (std + self.epsilon)
+         outputs = self.gamma * normalized + self.beta
+
+         return outputs
+
+
+ class Multihead_Attention(nn.Module):
+     """
+     multihead_attention
+     Adapted from <https://www.github.com/kyubyong/transformer>
+     1. split + cat
+     2. matmul(q, k)
+     3. mask k
+     4. softmax
+     5. mask q
+     6. matmul(attn, v)
+     7. split + cat
+     8. residual with q
+     9. norm
+     """
+
+     def __init__(self,
+                  hidden_dim,
+                  C_q=None,
+                  C_k=None,
+                  C_v=None,
+                  num_heads=1,
+                  dropout_rate=0.0):
+         super(Multihead_Attention, self).__init__()
+         self.hidden_dim = hidden_dim
+         C_q = C_q if C_q else hidden_dim
+         C_k = C_k if C_k else hidden_dim
+         C_v = C_v if C_v else hidden_dim
+         self.linear_Q = nn.Linear(C_q, hidden_dim)  # W_Q
+         self.linear_K = nn.Linear(C_k, hidden_dim)  # W_K
+         self.linear_V = nn.Linear(C_v, hidden_dim)  # W_V
+         self.num_heads = num_heads
+         self.norm = nn.LayerNorm(hidden_dim)
+         self.dropout = nn.Dropout(p=dropout_rate)
+
+     def forward(self, Q, K, V):
+         """
+         :param Q: A 3d tensor with shape of [N, T_q, C_q]
+         :param K: A 3d tensor with shape of [N, T_k, C_k]
+         :param V: A 3d tensor with shape of [N, T_v, C_v]
+         :return:
+         """
+         num_heads = self.num_heads
+         N = Q.size()[0]
+
+         # Linear projections
+         Q_l = nn.ReLU()(self.linear_Q(Q))  # W_Q x input_Q
+         K_l = nn.ReLU()(self.linear_K(K))  # W_K x input_K
+         V_l = nn.ReLU()(self.linear_V(V))  # W_V x input_V
+
+         # Split and concat
+         Q_split = Q_l.split(split_size=self.hidden_dim // num_heads, dim=2)
+         K_split = K_l.split(split_size=self.hidden_dim // num_heads, dim=2)
+         V_split = V_l.split(split_size=self.hidden_dim // num_heads, dim=2)
+
+         Q_ = torch.cat(Q_split, dim=0)  # (h*N, T_q, C/h)
+         K_ = torch.cat(K_split, dim=0)  # (h*N, T_k, C/h)
+         V_ = torch.cat(V_split, dim=0)  # (h*N, T_v, C/h)
+
+         # Multiplication
+         outputs = torch.bmm(Q_, K_.transpose(2, 1))  # Q x K^T scores
+
+         # Scale: divide by the square root of the key dimension
+         outputs = outputs / (K_.size()[-1] ** 0.5)
+
+         # Key Masking
+         key_masks = torch.sign(torch.abs(K).sum(dim=-1))  # (N, T_k)
+         key_masks = key_masks.repeat(num_heads, 1)  # (h*N, T_k)
+         key_masks = key_masks.unsqueeze(1).repeat(1, Q.size()[1], 1)  # (h*N, T_q, T_k)
+
+         paddings = torch.ones_like(key_masks) * (-2 ** 32 + 1)
+         outputs = torch.where(torch.eq(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)
+
+         # Activation: softmax turns the scores into probabilities
+         outputs = nn.Softmax(dim=2)(outputs)  # (h*N, T_q, T_k)
+
+         # Query Masking
+         query_masks = torch.sign(torch.abs(Q).sum(dim=-1))  # (N, T_q)
+         query_masks = query_masks.repeat(num_heads, 1)  # (h*N, T_q)
+         query_masks = query_masks.unsqueeze(-1).repeat(1, 1, K.size()[1])  # (h*N, T_q, T_k)
+         outputs = outputs * query_masks  # broadcasting. (h*N, T_q, T_k)
+
+         # Dropouts
+         outputs = self.dropout(outputs)
+
+         # Weighted sum: multiply the attention weights by V
+         outputs = torch.bmm(outputs, V_)  # (h*N, T_q, C/h)
+
+         # Restore shape
+         outputs = outputs.split(N, dim=0)  # (N, T_q, C)
+         outputs = torch.cat(outputs, dim=2)
+
+         # Residual connection
+         outputs = outputs + Q_l
+
+         # Normalize
+         outputs = self.norm(outputs)  # (N, T_q, C)
+
+         return outputs
+
+
+ class my_model(nn.Module):
+     def __init__(self):
+         super(my_model, self).__init__()
+         self.my_embed = nn.Embedding(textCNN_param['vocab_size'],
+                                      textCNN_param['embed_dim'], padding_idx=1)
+         self.my_linear = nn.Linear(256, 5)  # followed by softmax, one score per label class
+         # self.my_linear = nn.Linear(256, 4)
+         self.dropout = nn.Dropout(0.1)
+         self.layers = nn.ModuleList(
+             [Multihead_Attention(hidden_dim=textCNN_param['embed_dim'],
+                                  num_heads=1,
+                                  dropout_rate=0.1) for _ in range(6)])
+
+     def forward(self, sentences):
+         # sentences = sentences.long()
+         # sentences.to('cuda:0')
+         sentences = self.my_embed(sentences)  # N x T x embed_dim
+         for layer in self.layers:
+             sentences = layer(sentences, sentences, sentences)
+
+         model_output = torch.mean(sentences, dim=1)  # N x embed_dim
+         model_output = self.dropout(model_output)
+         model_output = self.my_linear(model_output)  # N x class_num
+         model_output = F.log_softmax(model_output, dim=1)
+         # model_output = self.dropout(model_output)
+         return model_output
+
+
+ if __name__ == '__main__':
+     features = torch.arange(0, 24)
+     features = torch.where(features < 20, features,
+                            torch.zeros_like(features))
+     features = features.view([2, 3, 4]).float()
+     print(features)
+     print(features.size())
+     attention1 = Attention1(hidden_dim=features.size()[-1])
+     print(attention1(features))
+     print('size is', attention1(features).size()[-1])
+
+     attention2 = Attention2(hidden_dim=features.size()[-1])
+     print(attention2(features))
+
+     attention3 = Multihead_Attention(hidden_dim=features.size()[-1],
+                                      num_heads=2,
+                                      dropout_rate=0.0)
+     print(attention3(features, features, features))
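
A minimal usage sketch of the attention block above (it assumes wordLabel.txt and data/label2.txt exist, because importing multihead_attention pulls textCNN_param from textCNN_data at import time; the tensor sizes are illustrative):

import torch
from multihead_attention import Multihead_Attention

x = torch.randn(4, 20, 128)      # N=4 sentences, T=20 tokens, C=128 features
mha = Multihead_Attention(hidden_dim=128, num_heads=8, dropout_rate=0.1)
out = mha(x, x, x)               # self-attention: Q = K = V
print(out.shape)                 # torch.Size([4, 20, 128]), same shape as the input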
sen2inds.py ADDED
@@ -0,0 +1,151 @@
+ # -*- coding: utf_8 -*-
+ from tqdm import tqdm
+ import jieba
+ import random
+
+ trainFile = 'data/output.txt'  # trainFile = 'data/train.txt'
+ trainDataVecFile = 'traindata_vec.txt'
+
+ devFile = 'data/output2.txt'  # 'data/dev.txt'
+ devDataVecFile = 'devdata_vec.txt'
+
+ labelFile = 'data/label2.txt'  # labelFile = 'data/label.txt'
+ stopwordFile = 'data/stopword.txt'
+
+ wordLabelFile = 'wordLabel.txt'
+
+ maxLen = 20
+
+ title_ind = [1, 2, 3, 4]
+ title_ind.extend([0] * 16)
+
+
+ def read_labelFile(file):
+     data = open(file, 'r', encoding='utf_8').read().split('\n')
+     data.remove('')
+     label_w2n = {}
+     label_n2w = {}
+     for line in tqdm(data, desc='read label'):
+         line = line.split(' ')
+         name_w = line[0]
+         name_n = int(line[1])
+         label_w2n[name_w] = name_n
+         label_n2w[name_n] = name_w
+
+     return label_w2n, label_n2w
+
+
+ def read_stopword(file):
+     data = open(file, 'r', encoding='utf_8').read().split('\n')
+
+     return data
+
+
+ def get_worddict(file):
+     datas = open(file, 'r', encoding='utf_8').read().split('\n')
+     datas = list(filter(None, datas))
+     word2ind = {}
+     for line in tqdm(datas, desc="get_worddict"):
+         line = line.split(' ')
+         word2ind[line[0]] = int(line[1])
+
+     ind2word = {word2ind[w]: w for w in word2ind}
+     return word2ind, ind2word
+
+
+ def json2txt():
+     label_dict, label_n2w = read_labelFile(labelFile)
+     word2ind, ind2word = get_worddict(wordLabelFile)
+     stoplist = read_stopword(stopwordFile)
+     cla_dict = {}
+
+     # train data to vec
+     traindataTxt = open(trainDataVecFile, 'w')
+     datas = open(trainFile, 'r', encoding='utf_8').readlines()
+     datas = list(filter(None, datas))
+     random.shuffle(datas)
+     for line in tqdm(datas, desc="traindata to vec"):
+         line = line.replace('\n', '').split(':')
+         # line = line.replace('\n', '').split('\t')
+         cla = line[1]
+         # if cla in [21, 13, 9, 24, 23, 19, 14]:
+         #     continue
+         if cla in cla_dict:
+             cla_dict[cla] += 1
+         else:
+             cla_dict[cla] = 1
+
+         cla_ind = label_dict[cla]
+         # title_seg = ['我', '要', '下', '单']  # leftover example, superseded by the line below
+         title_seg = [i for i in line[0]]  # character-level split
+         # title_seg = jieba.cut(line[0], cut_all=False)
+         title_ind = [cla_ind]
+         for w in title_seg:
+             if w in stoplist:
+                 continue
+             title_ind.append(word2ind[w])
+         length = len(title_ind)
+         if length > maxLen + 1:
+             title_ind = title_ind[0:maxLen + 1]
+         if length < maxLen + 1:
+             title_ind.extend([0] * (maxLen - length + 1))
+
+         for n in title_ind:
+             traindataTxt.write(str(n) + ',')
+         traindataTxt.write('\n')
+
+     # dev data to vec
+     traindataTxt = open(devDataVecFile, 'w')
+     datas = open(devFile, 'r', encoding='utf_8').readlines()
+     datas = list(filter(None, datas))
+     random.shuffle(datas)
+     for line in tqdm(datas, desc="dev to vec"):
+         line = line.replace('\n', '').split(':')
+         # line = line.replace('\n', '').split('\t')
+         cla = line[1]
+         # if cla in [21, 13, 9, 24, 23, 19, 14]:
+         #     continue
+         if cla in cla_dict:
+             cla_dict[cla] += 1
+         else:
+             cla_dict[cla] = 1
+
+         cla_ind = label_dict[cla]
+         title_seg = [i for i in line[0]]  # character-level split
+         # title_seg = jieba.cut(line[0], cut_all=False)
+         title_ind = [cla_ind]
+         for w in title_seg:
+             if w in stoplist:
+                 continue
+             title_ind.append(word2ind[w])
+         length = len(title_ind)
+         if length > maxLen + 1:
+             title_ind = title_ind[0:maxLen + 1]
+         if length < maxLen + 1:
+             title_ind.extend([0] * (maxLen - length + 1))
+
+         for n in title_ind:
+             traindataTxt.write(str(n) + ',')
+         traindataTxt.write('\n')
+
+     # per-class counts
+     cla_list = sorted(cla_dict.items(), key=lambda item: item[0], reverse=True)
+     f = open('cla_length.txt', 'w', encoding='utf_8')
+     total = 0
+     for t in cla_list:
+         a = str(t[0])
+         d = str(t[0]) + ' ' + str(label_dict[a]) + ' ' + str(t[1]) + '\n'
+         total += t[1]
+         f.write(d)
+
+     f.write('total: ' + str(total))
+
+
+ # traindata_vec.txt
+ # devdata_vec.txt
+ def main():
+     json2txt()
+
+
+ if __name__ == "__main__":
+     main()
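
Each line that json2txt() writes to traindata_vec.txt / devdata_vec.txt holds maxLen + 1 = 21 comma-separated integers: the label index followed by 20 word indices, truncated or 0-padded. A small sketch of that step, with hypothetical indices:

# mirrors the pad/truncate logic inside json2txt()
def pad_title(cla_ind, word_inds, maxLen=20):
    row = [cla_ind] + word_inds
    row = row[:maxLen + 1]                    # truncate long titles
    row += [0] * (maxLen + 1 - len(row))      # 0-pad short titles
    return row

print(pad_title(3, [12, 7, 45]))              # -> [3, 12, 7, 45, 0, 0, ..., 0] (21 ints)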
textCNN_data.py ADDED
@@ -0,0 +1,53 @@
+ from torch.utils.data import Dataset
+ import random
+ import numpy as np
+ from tqdm import tqdm
+ import torch
+
+ import sen2inds
+
+
+ class textCNN_data(Dataset):
+     def __init__(self, trainDataFile):
+         trainData = open(trainDataFile, 'r').read().split('\n')
+         trainData = list(filter(None, trainData))
+
+         res = []
+         for data in tqdm(trainData, desc='index to tensor'):
+             data = list(filter(None, data.split(',')))
+             data = [int(x) for x in data]
+             cla = torch.tensor(data[0], dtype=torch.long)
+             sentence = torch.tensor(data[1:], dtype=torch.long)
+             temp = []
+             temp.append(cla)
+             temp.append(sentence)
+             res.append(temp)
+
+         self.trainData = res
+
+     def __len__(self):
+         return len(self.trainData)
+
+     def __getitem__(self, idx):
+         data = self.trainData[idx]
+         cla = data[0]
+         sentence = data[1]
+
+         return cla, sentence
+
+
+ word2ind, ind2word = sen2inds.get_worddict('wordLabel.txt')
+ label_w2n, label_n2w = sen2inds.read_labelFile('data/label2.txt')
+
+ textCNN_param = {
+     'vocab_size': len(word2ind) + 1,  # plus one for the 0 padding index
+     'embed_dim': 256,  # each token is embedded as a 256-dim vector
+     'class_num': len(label_w2n),
+     "kernel_num": 16,
+     "kernel_size": [3, 4, 5],
+     "dropout": 0.5,
+ }
+ dataLoader_param = {
+     'batch_size': 128,
+     'shuffle': True,
+ }
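
A quick sketch of pulling one batch through this Dataset (it assumes traindata_vec.txt has already been generated by sen2inds.py; the printed shapes follow from batch_size=128 and maxLen=20):

from torch.utils.data import DataLoader
from textCNN_data import textCNN_data, dataLoader_param

dataset = textCNN_data('traindata_vec.txt')
loader = DataLoader(dataset, batch_size=dataLoader_param['batch_size'], shuffle=True)
clas, sentences = next(iter(loader))
print(clas.shape, sentences.shape)   # torch.Size([128]) torch.Size([128, 20])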
train2.py ADDED
@@ -0,0 +1,118 @@
+ import torch
+ import torch.nn as nn
+
+ from textCNN_data import textCNN_data, textCNN_param, dataLoader_param
+ from torch.utils.data import DataLoader
+ from multihead_attention import my_model
+ import os
+ from torch.nn import functional as F
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+
+ def validation(model, val_dataLoader, device):
+     model.eval()
+     total = 0
+     correct = 0
+     with torch.no_grad():
+         for i, (clas, sentences) in enumerate(val_dataLoader):
+             try:
+                 # sentences = sentences.type(torch.LongTensor).to(device)
+                 # clas = clas.type(torch.LongTensor).to(device)
+                 out = model(sentences.to(device))  # out: batch_size x class_num log-probabilities
+                 # out = F.relu(out.squeeze(-3))
+                 # out = F.max_pool1d(out, out.size(2)).squeeze(2)
+                 # softmax = nn.Softmax(dim=1)
+
+                 pred = torch.argmax(out, dim=1)  # batch_size x class_num -> batch_size
+
+                 correct += (pred == clas.to(device)).sum()
+                 total += clas.size()[0]
+             except IndexError as e:
+                 print(i)
+                 print('clas', clas)
+                 print('clas size', clas.size())
+                 print('sentence', sentences)
+                 print('sentences size', sentences.size())
+                 print(e)
+                 print(e.__traceback__)
+                 exit()
+
+     acc = correct / total
+     return acc
+
+
+ # seed = 66666666
+ # torch.cuda.manual_seed(seed)
+ # torch.manual_seed(seed)
+ # torch.backends.cudnn.deterministic = True
+ # torch.backends.cudnn.benchmark = False
+ if torch.cuda.is_available():
+     print(torch.cuda.get_device_name())
+     device = 'cuda:0'
+ else:
+     device = 'cpu'
+
+ # device = 'cpu'
+
+
+ # init dataset
+ print('init dataset...')
+ trainDataFile = 'traindata_vec.txt'
+ valDataFile = 'devdata_vec.txt'
+ train_dataset = textCNN_data(trainDataFile)
+ train_dataLoader = DataLoader(train_dataset,
+                               batch_size=dataLoader_param['batch_size'],
+                               shuffle=True)
+
+ val_dataset = textCNN_data(valDataFile)
+ val_dataLoader = DataLoader(val_dataset,
+                             batch_size=dataLoader_param['batch_size'],
+                             shuffle=False)
+
+ if __name__ == "__main__":
+     # set a random seed (see the commented block above) to make the results reproducible
+
+     # init net
+     print('init net...')
+     model = my_model()
+     model.to(device)
+     print(model)
+     optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
+     criterion = nn.CrossEntropyLoss()  # note: my_model already returns log_softmax; nn.NLLLoss would match that output directly
+
+     print("training...")
+
+     best_dev_acc = 0
+     # embed.train()
+     for epoch in range(100):
+         model.train()
+         for i, (clas, sentences) in enumerate(train_dataLoader):
+             # sentences: batch_size x maxLen token indices; after the embedding layer each
+             # token becomes an embed_dim vector, so a batch is batch_size x maxLen x embed_dim
+             out = model(sentences.to(device))  # out: batch_size x class_num log-probabilities
+             try:
+                 loss = criterion(out, clas.to(device))
+             except Exception:
+                 print(out.size(), out)
+                 print(clas.size(), clas)
+                 continue  # skip the batch that failed
+             optimizer.zero_grad()
+             loss.backward()
+             optimizer.step()
+             if (i + 1) % 10 == 0:
+                 print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
+         model.eval()
+         dev_acc = validation(model=model, val_dataLoader=val_dataLoader,
+                              device=device)
+
+         if best_dev_acc < dev_acc:
+             best_dev_acc = dev_acc
+             print("save model...")
+             torch.save(model.state_dict(), "model.bin")
+             print("epoch:", epoch + 1, "step:", i + 1, "loss:", loss.item())
+         print("best dev acc %.4f dev acc %.4f" % (best_dev_acc, dev_acc))
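
A hedged inference sketch for the checkpoint this script saves (it assumes model.bin, wordLabel.txt and data/label2.txt are present, that the label indices in data/label2.txt run 0..4 to match nn.Linear(256, 5), and that unknown characters can simply be mapped to the padding index 0):

import torch
from multihead_attention import my_model
from textCNN_data import label_n2w
import sen2inds

word2ind, _ = sen2inds.get_worddict('wordLabel.txt')

model = my_model()
model.load_state_dict(torch.load('model.bin', map_location='cpu'))
model.eval()

text = '我要下单'
ids = [word2ind.get(ch, 0) for ch in text][:20]
ids += [0] * (20 - len(ids))                     # 0-pad to maxLen
with torch.no_grad():
    out = model(torch.tensor([ids], dtype=torch.long))
print(label_n2w[out.argmax(dim=1).item()])       # predicted label name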
xlsx2txt.py ADDED
@@ -0,0 +1,41 @@
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ # Load the Excel file into a DataFrame and keep the text and manual-label columns
+ df = pd.read_excel('data/data_excel.xlsx')
+ n_rows = df.shape[0]
+ df = df.iloc[:, [2, 4]]
+ print(df)
+ # Keep only the rows whose manual label ("人工标注") is one of the five target classes
+ df = df[df['人工标注'].isin(['查件', '催件', '下单', '拒识', '非需求场景'])]
+ print(df)
+ # Split the DataFrame into train (90%) and test (10%) sets
+ train_df, test_df = train_test_split(df, test_size=0.1, train_size=0.9,
+                                      random_state=42)
+
+ # Strip surrounding whitespace from both columns
+ train_df = train_df.apply(lambda x: x.str.strip())
+ test_df = test_df.apply(lambda x: x.str.strip())
+ print(train_df)
+ print(test_df)
+ # Concatenate the two columns into "text:label"
+ train_df = train_df.iloc[:, 0] + train_df.iloc[:, 1].apply(
+     lambda x: ':' + str(x))
+ test_df = test_df.iloc[:, 0] + test_df.iloc[:, 1].apply(lambda x: ':' + str(x))
+
+ # Set the display options for left alignment
+ pd.options.display.max_colwidth = None
+ pd.options.display.colheader_justify = 'left'
+
+ # Print and write the DataFrames to text files
+ with open('data/output.txt', 'w', encoding='utf-8') as f:
+     output = train_df.to_string(index=False, header=False).replace(' ', '')
+     # output = output.replace(':', '\t')
+     f.write(output)
+     f.write('\n')
+
+ with open('data/output2.txt', 'w', encoding='utf-8') as f:
+     output = test_df.to_string(index=False, header=False).replace(' ', '')
+     # output = output.replace(':', '\t')
+     f.write(output)
+     f.write('\n')
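
For reference, a sketch of how one line of the files written above is consumed downstream (the sample line is illustrative; get_wordlists.py and sen2inds.py split each line on ':'):

# one line of data/output.txt: "<text>:<label>", whitespace already stripped out
line = '我要下单:下单'
text, label = line.split(':')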