File size: 4,013 Bytes
421645e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Natural Language Toolkit: Utility functions
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from itertools import chain

def pad_sequence(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Return an iterator over ``sequence`` with padding symbols added on the
    requested sides, ready for ngram extraction.

    Each padded side receives ``n - 1`` copies of its pad symbol, so no
    padding is added when ``n`` is 1 or less.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator over the (possibly padded) items
    :rtype: iter
    """
    source = iter(sequence)

    # Collect the pieces in output order: [left pad], data, [right pad].
    pieces = [source]
    if pad_left:
        pieces.insert(0, (left_pad_symbol,) * (n - 1))
    if pad_right:
        pieces.append((right_pad_symbol,) * (n - 1))

    # Without any padding, hand back the plain iterator unwrapped.
    if len(pieces) == 1:
        return source
    return chain(*pieces)


# Peripheral ngrams (those that include padding at the sequence edges) are
# produced by setting the pad_left / pad_right flags of ngrams().


def ngrams(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Lazily generate the ngrams of a sequence of items as tuples.

    For example:

        >>> from nltk.util import ngrams
        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Wrap the result in ``list`` to get a list. Set pad_left or pad_right
    to true in order to get the additional ngrams at the edges:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    If the (padded) sequence holds fewer than ``n`` items, nothing is
    yielded.

    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator over the ngrams, each one a tuple
    :rtype: iter
    """
    padded = pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    # Unique marker so that exhaustion can be told apart from any real item
    # (including None) without letting StopIteration escape the generator,
    # which PEP 479 would turn into a RuntimeError.
    exhausted = object()

    # Pre-fill the window with the first n - 1 items.
    window = []
    still_needed = n - 1
    while still_needed > 0:
        token = next(padded, exhausted)
        if token is exhausted:
            # Not enough data for even one ngram; end the generator.
            return
        window.append(token)
        still_needed -= 1

    # Slide the window: each new item completes one ngram, after which the
    # oldest item is dropped to make room for the next.
    for token in padded:
        window.append(token)
        yield tuple(window)
        window.pop(0)