# visakh7843's picture
# Bug fixes and minor UI changes
# 61aa7f2
# raw
# history blame
# 4.71 kB
# based on markov.py by Allison Parish
# https://github.com/aparrish/rwet-examples/blob/master/ngrams/markov.py
import random
def build_model(tokens, n):
    """Builds a Markov model from the list of tokens, using n-grams of
    length n.

    The model maps each n-gram (a tuple of n consecutive tokens) to the
    list of tokens observed immediately after it. Repeated observations
    appear multiple times, so the lists preserve frequency information
    for random.choice-based sampling.
    """
    model = dict()
    if len(tokens) < n:
        # Too few tokens to form even a single n-gram.
        return model
    for i in range(len(tokens) - n):
        gram = tuple(tokens[i:i+n])
        next_token = tokens[i+n]
        # setdefault handles both the first and subsequent sightings of a gram.
        model.setdefault(gram, []).append(next_token)
    return model
def generate(model, n, seed=None, max_iterations=100):
    """Generates a list of tokens from information in model, using n as the
    length of n-grams in the model. Starts the generation with the n-gram
    given as seed. If more than max_iterations iterations are reached, the
    process is stopped. (This is to prevent infinite loops.)

    seed may be None (a random n-gram key from the model is used), a
    single token (treated as a 1-gram), or a sequence of tokens such as
    a list sliced from a token line.
    """
    if seed is None:
        seed = random.choice(list(model.keys()))
    elif isinstance(seed, (list, tuple)):
        # Convert sequence seeds to tuples so they hash and compare
        # against the model's tuple keys. (Previously the whole sequence
        # was wrapped in a 1-tuple, which made every lookup fail — a
        # tuple containing a list is unhashable.)
        seed = tuple(seed)
    else:
        # A single scalar token: wrap it as a 1-gram.
        seed = (seed,)
    output = list(seed)
    current = tuple(seed)
    for _ in range(max_iterations):
        if current not in model:
            # Dead end: no continuation was recorded for this n-gram.
            break
        next_token = random.choice(model[current])
        if next_token is None:
            # None acts as an explicit end-of-sequence sentinel.
            break
        output.append(next_token)
        # Slide the n-gram window forward over the generated output.
        current = tuple(output[-n:])
    return output
def merge_models(models):
    """Merges two or more Markov models into a single new model.

    Returns a fresh dict and leaves the input models unmodified. (The
    previous implementation stored references to the input lists and
    then extended them in place, silently mutating the caller's models.)
    """
    merged_model = dict()
    for model in models:
        for gram, next_tokens in model.items():
            if gram in merged_model:
                merged_model[gram].extend(next_tokens)
            else:
                # Copy so later extends never touch the source model's list.
                merged_model[gram] = list(next_tokens)
    return merged_model
def generate_from_token_lists(token_lines, n, count=14, max_iterations=100):
    """Generates text from a list of lists of tokens. This function is intended
    for input where each line is a distinct unit (e.g., poetry) and the
    output should recreate lines in that form. Each line's opening n-gram
    is collected so generation only starts from observed "beginnings",
    and a per-line Markov model is built and merged into one combined
    model, so line-ending n-grams stay statistically representative of
    the source text. Returns a list of count generated token lists."""
    beginnings = [token_line[:n] for token_line in token_lines]
    combined_model = merge_models(
        build_model(token_line, n) for token_line in token_lines
    )
    return [
        generate(combined_model, n, random.choice(beginnings), max_iterations)
        for _ in range(count)
    ]
# def char_level_generate(lines, n, count=14, max_iterations=100):
# """Generates Markov chain text from the given lines, using character-level
# n-grams of length n. Returns a list of count items."""
# token_lines = [list(line) for line in lines]
# generated = generate_from_token_lists(token_lines, n, count, max_iterations)
# return [''.join(item) for item in generated]
# def word_level_generate(lines, n, count=14, max_iterations=100):
# """Generates Markov chain text from the given lines, using word-level
# n-grams of length n. Returns a list of count items."""
# token_lines = [line.split() for line in lines]
# generated = generate_from_token_lists(token_lines, n, count, max_iterations)
# return [' '.join(item) for item in generated]
def generate_model_from_token_lists(token_lines, n, count=14, max_iterations=100):
    """Builds a combined Markov model from a list of lists of tokens.

    A separate model is built for each token line and the per-line
    models are merged, so n-grams that begin and end lines in the
    original text keep their line-level statistics. Returns the merged
    model (a dict mapping n-gram tuples to lists of successor tokens).

    count and max_iterations are unused; they are kept for
    call-compatibility with generate_from_token_lists.
    """
    models = [build_model(token_line, n) for token_line in token_lines]
    return merge_models(models)
# if __name__ == '__main__':
# import sys
# n = int(sys.argv[1])
# lines = list()
# for line in sys.stdin:
# line = line.strip()
# lines.append(line)
# for generated in char_level_generate(lines, n):
# print(generated)