# based on markov.py by Allison Parrish
# https://github.com/aparrish/rwet-examples/blob/master/ngrams/markov.py

import random


def build_model(tokens, n):
    """Build a Markov model from a sequence of tokens.

    Args:
        tokens: sequence of hashable tokens (a list of words, a string of
            characters, ...).
        n: length of the n-grams used as keys of the model.

    Returns:
        A dict mapping every n-gram (a tuple of ``n`` tokens) that is
        followed by at least one token to the list of tokens seen right
        after it. Repeated followers are kept, so picking uniformly from
        the list reproduces the observed frequencies. The dict is empty
        when ``tokens`` holds ``n`` items or fewer.
    """
    model = dict()
    # range() is empty when there are not more than n tokens, so short
    # inputs need no separate guard.
    for i in range(len(tokens) - n):
        gram = tuple(tokens[i:i + n])
        model.setdefault(gram, []).append(tokens[i + n])
    # NOTE: no end-of-sequence marker (a trailing None follower for the
    # final n-gram) is recorded here; generate() simply stops when it
    # reaches an n-gram that has no followers.
    return model


def generate(model, n, seed=None, max_iterations=100):
    """Generate a list of tokens by walking ``model``.

    Args:
        model: dict mapping n-gram tuples to lists of follower tokens, as
            returned by build_model() or merge_models().
        n: length of the n-grams used as keys of ``model``.
        seed: where to start. ``None`` picks a random n-gram from the model.
            A list or tuple is taken as the starting n-gram. Any other value
            is treated as a single token, i.e. a one-token starting gram.
        max_iterations: upper bound on the number of tokens appended after
            the seed (this is what prevents infinite loops on cyclic models).

    Returns:
        The seed tokens followed by the generated tokens, as a list. The
        list is empty when no seed is given and the model is empty.
    """
    if seed is None:
        if not model:
            # Nothing to choose a starting n-gram from.
            return []
        seed = random.choice(list(model.keys()))
    elif isinstance(seed, (list, tuple)):
        # Model keys are tuples; a list seed must be converted to be hashable.
        seed = tuple(seed)
    else:
        seed = (seed,)

    output = list(seed)
    current = seed
    for _ in range(max_iterations):
        followers = model.get(current)
        if not followers:
            # Unknown n-gram (or one without followers): the chain ends here.
            break
        next_token = random.choice(followers)
        if next_token is None:
            # None is honoured as an explicit end-of-sequence marker for
            # models that contain one.
            break
        output.append(next_token)
        current = tuple(output[-n:])
    return output


def merge_models(models):
    """Merge any number of Markov models into a new one.

    Args:
        models: iterable of dicts mapping n-gram tuples to follower lists.

    Returns:
        A new dict in which each n-gram maps to the concatenation of its
        follower lists, in the order the models were given. The follower
        lists are fresh copies, so the input models are never modified.
    """
    merged_model = dict()
    for model in models:
        for gram, followers in model.items():
            merged_model.setdefault(gram, []).extend(followers)
    return merged_model


def generate_from_token_lists(token_lines, n, count=14, max_iterations=100):
    """Generate token lists from a list of lists of tokens.

    This function is intended for input text where each line forms a
    distinct unit (e.g., poetry), and where the desired output is to
    recreate lines in that form. It keeps track of the n-gram that comes at
    the beginning of each line and only generates lines that begin with one
    of these "beginnings". A separate Markov model is built for each line
    and the models are then merged, so no n-gram ever spans two lines and
    generated lines end where lines of the original text end.

    Args:
        token_lines: iterable of token sequences, one per input line.
        n: length of the n-grams.
        count: number of lines to generate.
        max_iterations: per-line limit passed on to generate().

    Returns:
        A list of ``count`` generated token lists, or an empty list when
        ``token_lines`` is empty.
    """
    # Materialize first: the lines are traversed twice and may come from a
    # one-shot iterator.
    token_lines = list(token_lines)
    if not token_lines:
        return []
    # Tuples, so that a beginning can be looked up directly in the model.
    beginnings = [tuple(token_line[:n]) for token_line in token_lines]
    combined_model = generate_model_from_token_lists(token_lines, n)
    return [
        generate(combined_model, n, random.choice(beginnings), max_iterations)
        for _ in range(count)
    ]


# def char_level_generate(lines, n, count=14, max_iterations=100):
#     """Generates Markov chain text from the given lines, using character-level
#     n-grams of length n. Returns a list of count items."""
#     token_lines = [list(line) for line in lines]
#     generated = generate_from_token_lists(token_lines, n, count, max_iterations)
#     return [''.join(item) for item in generated]

# def word_level_generate(lines, n, count=14, max_iterations=100):
#     """Generates Markov chain text from the given lines, using word-level
#     n-grams of length n. Returns a list of count items."""
#     token_lines = [line.split() for line in lines]
#     generated = generate_from_token_lists(token_lines, n, count, max_iterations)
#     return [' '.join(item) for item in generated]


def generate_model_from_token_lists(token_lines, n, count=14, max_iterations=100):
    """Build one combined Markov model from a list of lists of tokens.

    A separate model is built for each line and the per-line models are
    merged, so no n-gram ever spans two lines. No text is generated here;
    pass the result to generate().

    Args:
        token_lines: iterable of token sequences, one per input line.
        n: length of the n-grams.
        count: unused; accepted so the signature matches
            generate_from_token_lists().
        max_iterations: unused; accepted for the same reason.

    Returns:
        A dict mapping n-gram tuples to lists of follower tokens.
    """
    return merge_models(build_model(token_line, n) for token_line in token_lines)


# if __name__ == '__main__':
#     import sys
#     n = int(sys.argv[1])
#     lines = list()
#     for line in sys.stdin:
#         line = line.strip()
#         lines.append(line)
#     for generated in char_level_generate(lines, n):
#         print(generated)