zhangj726 committed on
Commit 5f1a2a0 · 1 Parent(s): fce2a66

Upload 31 files
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (166 Bytes)
src/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (146 Bytes)
src/apis/__init__.py ADDED
File without changes
src/apis/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (151 Bytes)
src/apis/__pycache__/inference.cpython-39.pyc ADDED
Binary file (1.44 kB)
src/apis/__pycache__/train.cpython-39.pyc ADDED
Binary file (1.68 kB)
src/apis/evaluate.py ADDED
@@ -0,0 +1,23 @@
import torch
import numpy as np
from src.models.EA_LSTM.model import weightedLSTM
from src.datasets.dataloader import MyDataset, create_vocab


def test(args):
    vocab, poetrys = create_vocab(args.data)
    # vocabulary size
    args.vocab_size = len(vocab)
    int2char = np.array(vocab)
    valid_dataset = MyDataset(vocab, poetrys, args, train=False)

    model = weightedLSTM(6110, 256, 128, 2, [1.0] * 80, False)
    model.load_state_dict(torch.load(args.save_path))
    model.eval()

    input_example_batch, target_example_batch = valid_dataset[0]
    with torch.no_grad():
        example_batch_predictions = model(input_example_batch)
    # sample one character id per position from the predicted distribution
    predicted_id = torch.distributions.Categorical(example_batch_predictions).sample()
    predicted_id = torch.squeeze(predicted_id, -1).numpy()
    print("Input: \n", repr("".join(int2char[input_example_batch])))
    print()
    print("Predictions: \n", repr("".join(int2char[predicted_id])))
src/apis/train.py ADDED
@@ -0,0 +1,68 @@
import math
import torch
import numpy as np
import torch.optim as optim
from src.utils.utils import make_cuda


def train(args, model, data_loader, initial=False):
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    model.train()
    num_epochs = args.initial_epochs if initial else args.num_epochs

    for epoch in range(num_epochs):
        loss = 0
        for step, (features, targets) in enumerate(data_loader):
            features = make_cuda(features)
            targets = make_cuda(targets)

            optimizer.zero_grad()

            pre, _ = model(features)
            crs_loss = model.cross_entropy(pre, targets.reshape(-1))
            loss += crs_loss.item()
            crs_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()

            # print step info: RCROSS_loss is the square root of the
            # average cross-entropy over the logging window
            if (step + 1) % args.log_step == 0:
                print("Epoch [%.3d/%.3d] Step [%.3d/%.3d]: CROSS_loss=%.4f, RCROSS_loss=%.4f"
                      % (epoch + 1,
                         num_epochs,
                         step + 1,
                         len(data_loader),
                         loss / args.log_step,
                         math.sqrt(loss / args.log_step)))
                loss = 0

    return model


def evaluate(args, model, data_loader):
    model.eval()
    loss = []
    with torch.no_grad():
        for step, (features, targets) in enumerate(data_loader):
            features = make_cuda(features)
            targets = make_cuda(targets)

            pre, _ = model(features)
            crs_loss = model.cross_entropy(pre, targets.reshape(-1))
            loss.append(crs_loss.item())

    print("loss=%.4f" % (np.mean(loss)))
src/datasets/__init__.py ADDED
File without changes
src/datasets/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (175 Bytes)
src/datasets/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (155 Bytes)
src/datasets/__pycache__/dataloader.cpython-38.pyc ADDED
Binary file (4.09 kB)
src/datasets/__pycache__/dataloader.cpython-39.pyc ADDED
Binary file (4.12 kB)
src/datasets/dataloader.py ADDED
@@ -0,0 +1,115 @@
import numpy as np
import pickle
import os
import torch.nn as nn
from gensim.models.word2vec import Word2Vec
from torch.utils.data import Dataset


def padding(poetries, maxlen, pad):
    batch_seq = [poetry + pad * (maxlen - len(poetry)) for poetry in poetries]
    return batch_seq


# Shift the input one character to the right to get the target,
# i.e. the task is to predict the next character.
def split_input_target(seq):
    inputs = seq[:-1]
    targets = seq[1:]
    return inputs, targets


# Load and filter the poem corpus
def get_poetry(arg):
    poetrys = []
    if arg.Augmented_dataset:
        path = arg.Augmented_data
    else:
        path = arg.data
    with open(path, "r", encoding='UTF-8') as f:
        for line in f:
            try:
                line = line.strip(u'\n')
                if arg.Augmented_dataset:
                    content = line.strip(u' ')
                else:
                    title, content = line.strip(u' ').split(u':')
                content = content.replace(u' ', u'')
                if u'_' in content or u'(' in content or u'（' in content or u'《' in content or u'[' in content:
                    continue
                if arg.strict_dataset:
                    if len(content) < 12 or len(content) > 79:
                        continue
                else:
                    if len(content) < 5 or len(content) > 79:
                        continue
                content = u'[' + content + u']'
                poetrys.append(content)
            except Exception:
                # skip malformed lines (e.g. no ':' separator)
                continue

    # Sort poems by length (number of characters)
    poetrys = sorted(poetrys, key=lambda line: len(line))

    with open("data/org_poetry.txt", "w", encoding="utf-8") as f:
        for poetry in poetrys:
            poetry = str(poetry).strip('[').strip(']').replace(',', '').replace('\'', '') + '\n'
            f.write(poetry)

    return poetrys


# Rewrite the corpus with characters separated by spaces
def split_text(poetrys):
    with open("data/split_poetry.txt", "w", encoding="utf-8") as f:
        for poetry in poetrys:
            poetry = str(poetry).strip('[').strip(']').replace(',', '').replace('\'', '') + '\n '
            split_data = " ".join(poetry)
            f.write(split_data)
    return open("data/split_poetry.txt", "r", encoding='UTF-8').read()


# Train character vectors with Word2Vec
def train_vec(split_file="data/split_poetry.txt", org_file="data/org_poetry.txt"):
    param_file = "data/word_vec.pkl"
    org_data = open(org_file, "r", encoding="utf-8").read().split("\n")
    if os.path.exists(split_file):
        all_data_split = open(split_file, "r", encoding="utf-8").read().split("\n")
    else:
        # regenerate the split file from the cleaned poems
        # (the original called split_text() with no argument, which fails)
        all_data_split = split_text(org_data).split("\n")

    if os.path.exists(param_file):
        return org_data, pickle.load(open(param_file, "rb"))

    # Each line is a plain string, so gensim iterates it character by
    # character; every character (including the pad space) becomes a token.
    models = Word2Vec(all_data_split, vector_size=256, workers=7, min_count=1)
    pickle.dump([models.syn1neg, models.wv.key_to_index, models.wv.index_to_key], open(param_file, "wb"))
    return org_data, (models.syn1neg, models.wv.key_to_index, models.wv.index_to_key)


class Poetry_Dataset(Dataset):
    def __init__(self, w1, word_2_index, all_data, Word2Vec):
        # NB: the Word2Vec flag shadows the gensim class name imported above
        self.Word2Vec = Word2Vec
        self.w1 = w1
        self.word_2_index = word_2_index
        word_size, embedding_num = w1.shape
        self.embedding = nn.Embedding(word_size, embedding_num)
        # length of the longest sequence
        maxlen = max([len(seq) for seq in all_data])
        pad = ' '
        self.all_data = padding(all_data[:-1], maxlen, pad)

    def __getitem__(self, index):
        a_poetry = self.all_data[index]

        a_poetry_index = [self.word_2_index[i] for i in a_poetry]
        xs, ys = split_input_target(a_poetry_index)
        if self.Word2Vec:
            # use the pretrained vectors directly as inputs
            xs_embedding = self.w1[xs]
        else:
            # return token ids; the model embeds them itself
            xs_embedding = np.array(xs)

        return xs_embedding, np.array(ys).astype(np.int64)

    def __len__(self):
        return len(self.all_data)
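A sketch of how these pieces appear intended to fit together, assuming data/org_poetry.txt and data/split_poetry.txt already exist (get_poetry and split_text write them); the batch size is illustrative:

# Hypothetical wiring: train character vectors, then serve
# (input, target) pairs through a DataLoader.
from torch.utils.data import DataLoader

org_data, (w1, word_2_index, index_2_word) = train_vec()
dataset = Poetry_Dataset(w1, word_2_index, org_data, True)  # True: feed pretrained vectors
loader = DataLoader(dataset, batch_size=32)

xs_embedding, ys = next(iter(loader))
# xs_embedding: [32, maxlen - 1, 256] float32; ys: [32, maxlen - 1] int64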
src/models/LSTM/__init__.py ADDED
File without changes
src/models/LSTM/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (178 Bytes)
src/models/LSTM/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (161 Bytes)
src/models/LSTM/__pycache__/algorithm.cpython-39.pyc ADDED
Binary file (4.99 kB)
src/models/LSTM/__pycache__/model.cpython-38.pyc ADDED
Binary file (1.58 kB)
src/models/LSTM/__pycache__/model.cpython-39.pyc ADDED
Binary file (1.55 kB)
src/models/LSTM/model.py ADDED
@@ -0,0 +1,37 @@
import torch
import torch.nn as nn


class Poetry_Model_lstm(nn.Module):
    def __init__(self, hidden_num, word_size, embedding_num, Word2Vec):
        super().__init__()

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.hidden_num = hidden_num
        self.Word2Vec = Word2Vec

        self.embedding = nn.Embedding(word_size, embedding_num)
        self.lstm = nn.LSTM(input_size=embedding_num, hidden_size=hidden_num, batch_first=True, num_layers=2,
                            bidirectional=False)
        self.dropout = nn.Dropout(0.3)
        self.flatten = nn.Flatten(0, 1)
        self.linear = nn.Linear(hidden_num, word_size)
        self.cross_entropy = nn.CrossEntropyLoss()

    def forward(self, xs_embedding, h_0=None, c_0=None):
        # xs_embedding: [batch_size, max_seq_len, embedding_num]
        if h_0 is None or c_0 is None:
            # 2 = num_layers, unidirectional
            h_0 = torch.zeros((2, xs_embedding.shape[0], self.hidden_num), dtype=torch.float32)
            c_0 = torch.zeros((2, xs_embedding.shape[0], self.hidden_num), dtype=torch.float32)
        h_0 = h_0.to(self.device)
        c_0 = c_0.to(self.device)
        xs_embedding = xs_embedding.to(self.device)
        if not self.Word2Vec:
            # inputs are token ids; look them up in the trainable embedding
            xs_embedding = self.embedding(xs_embedding)
        hidden, (h_0, c_0) = self.lstm(xs_embedding, (h_0, c_0))
        hidden_drop = self.dropout(hidden)
        # merge batch and time so the linear layer scores every step
        hidden_flatten = self.flatten(hidden_drop)
        pre = self.linear(hidden_flatten)
        # pre: [batch_size * max_seq_len, word_size]
        return pre, (h_0, c_0)
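A quick, hypothetical shape check for the forward pass; the sizes are illustrative (word_size=6110 just echoes the constant used in evaluate.py):

# Hypothetical shape check with token-id inputs (Word2Vec=False).
import torch

model = Poetry_Model_lstm(hidden_num=128, word_size=6110, embedding_num=256, Word2Vec=False)
model = model.to(model.device)   # the class sets self.device but never moves itself
xs = torch.randint(0, 6110, (4, 31)).to(model.device)   # [batch_size, seq_len] token ids
pre, (h_n, c_n) = model(xs)
print(pre.shape)   # torch.Size([124, 6110]), i.e. [4 * 31, 6110]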
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (173 Bytes)
src/models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (153 Bytes)
src/utils/__init__.py ADDED
File without changes
src/utils/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (172 Bytes)
src/utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (152 Bytes)
src/utils/__pycache__/utils.cpython-38.pyc ADDED
Binary file (575 Bytes)
src/utils/__pycache__/utils.cpython-39.pyc ADDED
Binary file (555 Bytes)
src/utils/utils.py ADDED
@@ -0,0 +1,15 @@
import torch


def make_cuda(tensor):
    """Move the tensor to CUDA if a GPU is available."""
    if torch.cuda.is_available():
        tensor = tensor.cuda()
    return tensor


def is_minimum(value, indiv_to_rmse):
    """Return True if `value` beats every RMSE recorded so far."""
    if len(indiv_to_rmse) == 0:
        return True
    return value < min(indiv_to_rmse.values())
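For illustration, is_minimum reads as a guard for keeping the best individual: it answers whether a new RMSE beats everything recorded so far. A tiny hypothetical usage (the dict keys are assumptions):

history = {}                        # individual -> RMSE
print(is_minimum(0.42, history))    # True: nothing recorded yet
history["indiv_0"] = 0.40
print(is_minimum(0.42, history))    # False: 0.40 is already better
print(is_minimum(0.35, history))    # True: new best so far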