File size: 4,710 Bytes
1d4f575
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
824146a
 
 
 
 
 
1d4f575
824146a
 
 
 
 
 
1d4f575
 
 
 
 
 
 
 
 
 
61aa7f2
1d4f575
 
61aa7f2
 
1d4f575
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# based on markov.py by Allison Parrish
# https://github.com/aparrish/rwet-examples/blob/master/ngrams/markov.py

import random

def build_model(tokens, n):
	"""Build a Markov model from a sequence of tokens.

	Every run of n consecutive tokens (an n-gram) that is followed by at
	least one more token becomes a key of the model, stored as a tuple. The
	value is the list of tokens observed immediately after that n-gram, in
	order of appearance and with repeats kept.

	Args:
		tokens: Sliceable sequence of tokens (e.g. a list of words, or a
			string for character-level tokens).
		n: Length of the n-grams used as keys.

	Returns:
		A dict mapping n-gram tuples to lists of following tokens. The dict
		is empty when tokens holds n or fewer items, because no n-gram has a
		following token in that case.
	"""
	model = {}
	# range() is empty when len(tokens) <= n, so short inputs need no
	# separate guard.
	for start in range(len(tokens) - n):
		gram = tuple(tokens[start:start + n])
		model.setdefault(gram, []).append(tokens[start + n])
	return model

def generate(model, n, seed=None, max_iterations=100):
	"""Generate a list of tokens by walking a Markov model.

	Starting from the seed, the last n tokens of the output are looked up in
	the model and one of the recorded following tokens is picked at random
	and appended. This repeats until the current n-gram is not a key of the
	model, a None token is drawn, or max_iterations tokens have been added.

	Args:
		model: Dict mapping n-gram tuples to lists of possible next tokens.
		n: Length of the n-grams used as keys in model.
		seed: Where to start. None picks a random n-gram from the model. A
			list or tuple is used as the starting n-gram itself. Any other
			value (e.g. a single word) is treated as one token, which can
			only match a key when n is 1.
		max_iterations: Upper bound on the number of tokens appended after
			the seed; this prevents infinite loops on cyclic models.

	Returns:
		The seed tokens followed by the generated tokens, as a list. An
		empty list is returned when seed is None and the model is empty.
	"""
	if seed is None:
		if not model:
			# No n-gram to start from; random.choice would raise on an
			# empty sequence.
			return []
		output = list(random.choice(list(model)))
	elif isinstance(seed, (list, tuple)):
		# A sequence seed is already an n-gram. Wrapping it in another tuple
		# would make it unhashable (list) or unmatchable (tuple) as a key.
		output = list(seed)
	else:
		output = [seed]
	current = tuple(output)
	for _ in range(max_iterations):
		if current not in model:
			break
		next_token = random.choice(model[current])
		if next_token is None:
			# None in a model marks the end of a sequence.
			print('next token is none')
			break
		output.append(next_token)
		current = tuple(output[-n:])
	return output

def merge_models(models):
	"""Merge any number of Markov models into a single new model.

	For every n-gram key, the merged value is the concatenation of that
	key's next-token lists from each input model, in the order the models
	are given.

	Args:
		models: Iterable of dicts mapping n-gram tuples to lists of tokens.

	Returns:
		A new dict with its own lists; the input models are not modified.
		The result is empty when models is empty.
	"""
	merged_model = {}
	for model in models:
		for key, next_tokens in model.items():
			if key in merged_model:
				merged_model[key].extend(next_tokens)
			else:
				# Copy so that later extend() calls do not grow the list
				# owned by the input model.
				merged_model[key] = list(next_tokens)
	return merged_model

def generate_from_token_lists(token_lines, n, count=14, max_iterations=100):
	"""Generate token sequences from a list of token lists.

	Intended for input where each line is a distinct unit (e.g. poetry).
	The first n tokens of every line are remembered as a possible
	"beginning". A separate Markov model is built for each line with
	build_model and the per-line models are combined with merge_models, so
	n-grams never span two lines. Each output sequence is produced by
	calling generate on the combined model with a randomly chosen beginning
	passed as the seed.

	Args:
		token_lines: Iterable of token sequences, one per input line.
		n: Length of the n-grams.
		count: Number of sequences to generate.
		max_iterations: Passed through to generate for every sequence.

	Returns:
		A list of count results from generate, or an empty list when
		token_lines contains no lines.
	"""
	beginnings = []
	line_models = []
	for token_line in token_lines:
		beginnings.append(token_line[:n])
		line_models.append(build_model(token_line, n))
	if not beginnings:
		# random.choice raises IndexError on an empty sequence; with no
		# input lines there is nothing to generate.
		return []
	combined_model = merge_models(line_models)
	generated_list = []
	for _ in range(count):
		generated_tokens = generate(combined_model, n,
				random.choice(beginnings), max_iterations)
		generated_list.append(generated_tokens)
	return generated_list

# def char_level_generate(lines, n, count=14, max_iterations=100):
# 	"""Generates Markov chain text from the given lines, using character-level
# 		n-grams of length n. Returns a list of count items."""
# 	token_lines = [list(line) for line in lines]
# 	generated = generate_from_token_lists(token_lines, n, count, max_iterations)
# 	return [''.join(item) for item in generated]

# def word_level_generate(lines, n, count=14, max_iterations=100):
# 	"""Generates Markov chain text from the given lines, using word-level
# 		n-grams of length n. Returns a list of count items."""
# 	token_lines = [line.split() for line in lines]
# 	generated = generate_from_token_lists(token_lines, n, count, max_iterations)
# 	return [' '.join(item) for item in generated]

def generate_model_from_token_lists(token_lines, n, count=14, max_iterations=100):
	"""Build one combined Markov model from a list of token lists.

	A separate model is built for each line with build_model, and the
	per-line models are then combined with merge_models. No text is
	generated here; only the combined model is returned.

	Args:
		token_lines: Iterable of token sequences, one per input line.
		n: Length of the n-grams used as keys.
		count: Unused; kept so the signature matches
			generate_from_token_lists.
		max_iterations: Unused; kept so the signature matches
			generate_from_token_lists.

	Returns:
		The dict returned by merge_models for the per-line models.
	"""
	line_models = [build_model(token_line, n) for token_line in token_lines]
	return merge_models(line_models)


# if __name__ == '__main__':
# 	import sys
# 	n = int(sys.argv[1])
# 	lines = list()
# 	for line in sys.stdin:
# 		line = line.strip()
# 		lines.append(line)
# 	for generated in char_level_generate(lines, n):
# 		print(generated)