nevmenandr
/

char-based-tensorflow-russian-tolkien-onomastics

+import tensorflow as tf
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import seq2seq
+import random
+import numpy as np
class Model(object):
    """Multi-layer recurrent language model expressed as a TensorFlow graph.

    The graph is: embedding lookup -> ``args.num_layers`` stacked recurrent
    cells of width ``args.rnn_size`` -> softmax projection onto
    ``args.vocab_size`` symbols.  Training minimises the mean per-symbol
    sequence loss with Adam, clipping gradients by global norm.

    NOTE(review): the calls below use axis-first argument order
    (``tf.split(1, n, value)``, ``tf.concat(1, values)``) and the
    ``tensorflow.python.ops.rnn_cell`` / ``seq2seq`` modules -- presumably an
    early (pre-1.0) TensorFlow release; confirm against the pinned version.
    """

    def __init__(self, args, infer=False):
        """Build the graph.

        Args:
            args: configuration object; the attributes read here are
                ``model`` ('rnn', 'gru' or 'lstm'), ``rnn_size``,
                ``num_layers``, ``batch_size``, ``seq_length``,
                ``vocab_size`` and ``grad_clip``.
            infer: when true, the graph is built for sampling:
                ``args.batch_size`` and ``args.seq_length`` are overwritten
                with 1 (the caller's object is mutated in place) and each
                decoder step is fed the embedding of the previous step's
                arg-max symbol.

        Raises:
            ValueError: if ``args.model`` is not one of the supported names.
        """
        self.args = args
        if infer:
            # Sampling feeds one symbol at a time.
            args.batch_size = 1
            args.seq_length = 1

        if args.model == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn_cell.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn_cell.BasicLSTMCell
        else:
            # ValueError is a subclass of Exception, so existing
            # ``except Exception`` handlers keep working.
            raise ValueError("model type not supported: {}".format(args.model))

        cell = cell_fn(args.rnn_size)
        self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

        self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
                # One [batch_size, rnn_size] tensor per time step.
                inputs = tf.split(1, args.seq_length, tf.nn.embedding_lookup(embedding, self.input_data))
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            # Project the previous output to logits, take the arg-max symbol
            # and feed its embedding as the next input (inference only).
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        outputs, last_state = seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=loop if infer else None, scope='rnnlm')
        output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        loss = seq2seq.sequence_loss_by_example([self.logits],
                [tf.reshape(self.targets, [-1])],
                [tf.ones([args.batch_size * args.seq_length])],
                args.vocab_size)
        # Mean loss per symbol.
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state
        # Initialised to 0.0 and not trainable; presumably assigned by the
        # training script before running ``train_op`` -- verify against caller.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    @staticmethod
    def _weighted_pick(weights):
        """Draw one index with probability proportional to ``weights``.

        Args:
            weights: non-empty 1-D sequence of non-negative numbers; it does
                not have to sum to 1.

        Returns:
            A plain ``int`` in ``range(len(weights))``.

        Raises:
            ValueError: if ``weights`` is empty.
        """
        cumulative = np.cumsum(weights)
        if cumulative.size == 0:
            raise ValueError("cannot sample from empty weights")
        # Scale by the last cumulative value (not a separately computed sum)
        # so the threshold cannot exceed the search range through round-off.
        threshold = np.random.rand() * cumulative[-1]
        # side='right' skips zero-weight entries when the draw is exactly 0.
        index = int(np.searchsorted(cumulative, threshold, side='right'))
        # Guard against the draw rounding up to the total.
        return min(index, cumulative.size - 1)

    def sample(self, sess, words, vocab, num=200, prime='first all', sampling_type=1):
        """Generate ``num`` tokens following ``prime``.

        ``prime`` is split on whitespace; every token but the last warms up
        the recurrent state, and the last one seeds generation.  Tokens not
        found in ``vocab`` are fed as id 0.

        Args:
            sess: session in which the graph's variables are available.
            words: sequence mapping a symbol id to its token.
            vocab: dict mapping a token to its symbol id.
            num: number of tokens to generate.
            prime: seed text; if empty or whitespace-only a random key of
                ``vocab`` is used instead.
            sampling_type: 0 = always take the arg-max; 2 = take the arg-max
                except right after a ``'\\n'`` token, where a weighted draw
                is made; any other value = weighted draw at every step.

        Returns:
            ``prime`` followed by the generated tokens, joined with single
            spaces.
        """
        state = sess.run(self.cell.zero_state(1, tf.float32))
        if not prime or not prime.strip():
            prime = random.choice(list(vocab.keys()))
        print(prime)
        # A randomly chosen prime may itself be a whitespace token, for which
        # split() yields nothing; fall back to the raw prime in that case.
        tokens = prime.split() or [prime]

        for word in tokens[:-1]:
            print(word)
            x = np.zeros((1, 1))
            x[0, 0] = vocab.get(word, 0)
            feed = {self.input_data: x, self.initial_state: state}
            [state] = sess.run([self.final_state], feed)

        generated = []
        word = tokens[-1]
        for _ in range(num):
            x = np.zeros((1, 1))
            x[0, 0] = vocab.get(word, 0)
            feed = {self.input_data: x, self.initial_state: state}
            [probs, state] = sess.run([self.probs, self.final_state], feed)
            p = probs[0]

            if sampling_type == 0:
                picked = np.argmax(p)
            elif sampling_type == 2 and word != '\n':
                picked = np.argmax(p)
            else:
                picked = self._weighted_pick(p)

            word = words[picked]
            generated.append(word)
        return ' '.join([prime] + generated)

sample.py ADDED Viewed

	@@ -0,0 +1,54 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import print_function
+import numpy as np
+import tensorflow as tf
+import argparse
+import time
+import os
+from six.moves import cPickle
+from utils import TextLoader
+from model import Model
+from six import text_type
+import re
def main(argv=None):
    """Parse command-line options and print a sample from a saved model.

    Args:
        argv: optional list of argument strings; when ``None`` (the default)
            argparse reads the process command line, so a plain ``main()``
            call behaves as before.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--save_dir', type=str, default='./save',
                       help='model directory to store checkpointed models')
    parser.add_argument('-n', type=int, default=800,
                       help='number of characters to sample')
    parser.add_argument('--prime', type=text_type, default=u'Промхимия ',
                       help='prime text')
    parser.add_argument('--sample', type=int, default=1,
                       help='0 to use max at each timestep, 1 to sample at each timestep, 2 to sample on spaces')
    args = parser.parse_args(argv)
    sample(args)
def sample(args):
    """Restore the latest checkpoint in ``args.save_dir`` and print a sample.

    Reads ``config.pkl`` (the saved model arguments) and ``chars_vocab.pkl``
    (a ``(chars, vocab)`` pair) from ``args.save_dir``, rebuilds the model in
    inference mode, restores its weights and prints the generated text.  If
    no checkpoint is found, a message is written to stderr and nothing is
    sampled.

    Args:
        args: parsed options; the attributes read here are ``save_dir``,
            ``n``, ``prime`` and ``sample``.
    """
    import sys

    # NOTE(security): unpickling executes arbitrary code from the file; only
    # point ``save_dir`` at directories you trust.
    with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f:
        saved_args = cPickle.load(f)
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'rb') as f:
        chars, vocab = cPickle.load(f)

    model = Model(saved_args, True)
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        ckpt = tf.train.get_checkpoint_state(args.save_dir)
        if not (ckpt and ckpt.model_checkpoint_path):
            print('no checkpoint found in {}'.format(args.save_dir), file=sys.stderr)
            return

        saver.restore(sess, ckpt.model_checkpoint_path)
        sample_string = model.sample(sess, chars, vocab, args.n, args.prime, args.sample)
        # Drop the single space in front of every non-space character, then
        # squeeze whatever runs of spaces remain down to one.
        sample_string = re.sub(u' ([^ ])', u'\\1', sample_string)
        sample_string = re.sub(u'[ ]+', u' ', sample_string)
        print(sample_string)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()

utils.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import codecs
+import os
+import collections
+from six.moves import cPickle
+import numpy as np
class TextLoader(object):
    """Load a text corpus as character ids and serve it in fixed-size batches.

    The corpus is read from ``<data_dir>/input.txt``.  The character table
    and the encoded corpus are cached next to it as ``vocab.pkl`` and
    ``data.npy``; when both cache files exist they are loaded instead of
    re-reading the text.
    """

    def __init__(self, data_dir, batch_size, seq_length, encoding='utf-8'):
        """Prepare the corpus and build the batches.

        Args:
            data_dir: directory holding ``input.txt`` and the cache files.
            batch_size: number of sequences per batch.
            seq_length: number of symbols per sequence.
            encoding: text encoding of ``input.txt``.

        Raises:
            ValueError: if the text has to be read and is empty.
            AssertionError: if the corpus is too small for a single batch.
        """
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.encoding = encoding

        input_file = os.path.join(data_dir, "input.txt")
        vocab_file = os.path.join(data_dir, "vocab.pkl")
        tensor_file = os.path.join(data_dir, "data.npy")

        if os.path.exists(vocab_file) and os.path.exists(tensor_file):
            print("loading preprocessed files")
            self.load_preprocessed(vocab_file, tensor_file)
        else:
            print("reading text file")
            self.preprocess(input_file, vocab_file, tensor_file)
        self.create_batches()
        self.reset_batch_pointer()

    def _count_batches(self):
        """Return how many full batches fit into the encoded corpus."""
        # Integer division stays exact for corpora of any size.
        return self.tensor.size // (self.batch_size * self.seq_length)

    def preprocess(self, input_file, vocab_file, tensor_file):
        """Read the text, build the character table and write both caches.

        Characters are numbered by descending frequency, so id 0 is the most
        common character.

        Raises:
            ValueError: if ``input_file`` contains no characters.
        """
        with codecs.open(input_file, "r", encoding=self.encoding) as f:
            data = f.read()
        if not data:
            raise ValueError("input file is empty: {}".format(input_file))

        counter = collections.Counter(data)
        self.chars = tuple(char for char, _ in counter.most_common())
        self.vocab_size = len(self.chars)
        self.vocab = {char: index for index, char in enumerate(self.chars)}
        with open(vocab_file, 'wb') as f:
            cPickle.dump(self.chars, f)
        self.tensor = np.array([self.vocab[char] for char in data])
        np.save(tensor_file, self.tensor)

    def load_preprocessed(self, vocab_file, tensor_file):
        """Load the character table and encoded corpus from the cache files."""
        # NOTE(security): unpickling executes arbitrary code from the file;
        # only load caches you created yourself.
        with open(vocab_file, 'rb') as f:
            self.chars = cPickle.load(f)
        self.vocab_size = len(self.chars)
        self.vocab = {char: index for index, char in enumerate(self.chars)}
        self.tensor = np.load(tensor_file)
        self.num_batches = self._count_batches()

    def create_batches(self):
        """Split the corpus into ``num_batches`` input/target batch pairs.

        The corpus is truncated to a whole number of batches.  Targets are
        the inputs shifted left by one symbol, with the first symbol wrapped
        around to the end.

        Raises:
            AssertionError: if the corpus is too small for a single batch.
        """
        self.num_batches = self._count_batches()
        # When the data (tensor) is too small, give a clear error message.
        # Raised explicitly rather than through ``assert`` so the check is
        # not stripped under ``python -O``; the exception type is kept for
        # callers that already catch it.
        if self.num_batches == 0:
            raise AssertionError("Not enough data. Make seq_length and batch_size small.")

        usable = self.num_batches * self.batch_size * self.seq_length
        self.tensor = self.tensor[:usable]
        xdata = self.tensor
        ydata = np.roll(xdata, -1)
        self.x_batches = np.split(xdata.reshape(self.batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(ydata.reshape(self.batch_size, -1), self.num_batches, 1)

    def next_batch(self):
        """Return the next ``(x, y)`` batch pair and advance the pointer."""
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

    def reset_batch_pointer(self):
        """Rewind to the first batch."""
        self.pointer = 0