# Natural Language Toolkit: Utility functions # # Copyright (C) 2001-2020 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT from itertools import chain def pad_sequence( sequence, n, pad_left=False, pad_right=False, left_pad_symbol=None, right_pad_symbol=None, ): """ Returns a padded sequence of items before ngram extraction. >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) ['', 1, 2, 3, 4, 5, ''] >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) ['', 1, 2, 3, 4, 5] >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) [1, 2, 3, 4, 5, ''] :param sequence: the source data to be padded :type sequence: sequence or iter :param n: the degree of the ngrams :type n: int :param pad_left: whether the ngrams should be left-padded :type pad_left: bool :param pad_right: whether the ngrams should be right-padded :type pad_right: bool :param left_pad_symbol: the symbol to use for left padding (default is None) :type left_pad_symbol: any :param right_pad_symbol: the symbol to use for right padding (default is None) :type right_pad_symbol: any :rtype: sequence or iter """ sequence = iter(sequence) if pad_left: sequence = chain((left_pad_symbol,) * (n - 1), sequence) if pad_right: sequence = chain(sequence, (right_pad_symbol,) * (n - 1)) return sequence # add a flag to pad the sequence so we get peripheral ngrams? def ngrams( sequence, n, pad_left=False, pad_right=False, left_pad_symbol=None, right_pad_symbol=None, ): """ Return the ngrams generated from a sequence of items, as an iterator. For example: >>> from nltk.util import ngrams >>> list(ngrams([1,2,3,4,5], 3)) [(1, 2, 3), (2, 3, 4), (3, 4, 5)] Wrap with list for a list version of this function. Set pad_left or pad_right to true in order to get additional ngrams: >>> list(ngrams([1,2,3,4,5], 2, pad_right=True)) [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) [(1, 2), (2, 3), (3, 4), (4, 5), (5, '')] >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) [('', 1), (1, 2), (2, 3), (3, 4), (4, 5)] >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) [('', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '')] :param sequence: the source data to be converted into ngrams :type sequence: sequence or iter :param n: the degree of the ngrams :type n: int :param pad_left: whether the ngrams should be left-padded :type pad_left: bool :param pad_right: whether the ngrams should be right-padded :type pad_right: bool :param left_pad_symbol: the symbol to use for left padding (default is None) :type left_pad_symbol: any :param right_pad_symbol: the symbol to use for right padding (default is None) :type right_pad_symbol: any :rtype: sequence or iter """ sequence = pad_sequence( sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol ) history = [] while n > 1: # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator try: next_item = next(sequence) except StopIteration: # no more data, terminate the generator return history.append(next_item) n -= 1 for item in sequence: history.append(item) yield tuple(history) del history[0]