#
# Natural Language Toolkit: Sentiment Analyzer
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks

using NLTK features and classifiers, especially for teaching and demonstrative

purposes.

"""

import pickle
import sys
from collections import defaultdict

from nltk.classify.util import accuracy as eval_accuracy
from nltk.classify.util import apply_features
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics import f_measure as eval_f_measure
from nltk.metrics import precision as eval_precision
from nltk.metrics import recall as eval_recall
from nltk.probability import FreqDist


class SentimentAnalyzer:
    """

    A Sentiment Analysis tool based on machine learning approaches.

    """

    def __init__(self, classifier=None):
        self.feat_extractors = defaultdict(list)
        self.classifier = classifier

    def all_words(self, documents, labeled=None):
        """

        Return all words/tokens from the documents (with duplicates).



        :param documents: a list of (words, label) tuples.

        :param labeled: if `True`, assume that each document is represented by a

            (words, label) tuple: (list(str), str). If `False`, each document is

            considered as being a simple list of strings: list(str).

        :rtype: list(str)

        :return: A list of all words/tokens in `documents`.

        """
        all_words = []
        if labeled is None:
            labeled = documents and isinstance(documents[0], tuple)
        if labeled:
            for words, _sentiment in documents:
                all_words.extend(words)
        else:
            for words in documents:
                all_words.extend(words)
        return all_words

    def apply_features(self, documents, labeled=None):
        """

        Apply all feature extractor functions to the documents. This is a wrapper

        around `nltk.classify.util.apply_features`.



        If `labeled=False`, return featuresets as:

            [feature_func(doc) for doc in documents]

        If `labeled=True`, return featuresets as:

            [(feature_func(tok), label) for (tok, label) in toks]



        :param documents: a list of documents. `If labeled=True`, the method expects

            a list of (words, label) tuples.

        :rtype: LazyMap

        """
        return apply_features(self.extract_features, documents, labeled)
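
    # A hedged usage sketch (`analyzer` and `extract` are illustrative names,
    # not part of this module): with a single registered extractor `extract`,
    # the two modes return, respectively,
    #
    #   analyzer.apply_features([["good", "movie"]], labeled=False)
    #       -> [extract(["good", "movie"])]
    #   analyzer.apply_features([(["good", "movie"], "pos")], labeled=True)
    #       -> [(extract(["good", "movie"]), "pos")]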

    def unigram_word_feats(self, words, top_n=None, min_freq=0):
        """

        Return most common top_n word features.



        :param words: a list of words/tokens.

        :param top_n: number of best words/tokens to use, sorted by frequency.

        :rtype: list(str)

        :return: A list of `top_n` words/tokens (with no duplicates) sorted by

            frequency.

        """
        # Stopwords are not removed
        unigram_feats_freqs = FreqDist(words)
        return [
            w
            for w, f in unigram_feats_freqs.most_common(top_n)
            if unigram_feats_freqs[w] > min_freq
        ]
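
    # A minimal sketch of the expected call pattern (the word list is a toy
    # example): keep only tokens seen more than `min_freq` times, most
    # frequent first.
    #
    #   analyzer = SentimentAnalyzer()
    #   analyzer.unigram_word_feats(["good", "good", "bad", "ok"], min_freq=1)
    #   # -> ["good"]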

    def bigram_collocation_feats(
        self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi
    ):
        """

        Return `top_n` bigram features (using `assoc_measure`).

        Note that this method is based on bigram collocations measures, and not

        on simple bigram frequency.



        :param documents: a list (or iterable) of tokens.

        :param top_n: number of best words/tokens to use, sorted by association

            measure.

        :param assoc_measure: bigram association measure to use as score function.

        :param min_freq: the minimum number of occurrencies of bigrams to take

            into consideration.



        :return: `top_n` ngrams scored by the given association measure.

        """
        finder = BigramCollocationFinder.from_documents(documents)
        finder.apply_freq_filter(min_freq)
        return finder.nbest(assoc_measure, top_n)
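
    # A hedged sketch over toy documents: bigrams below `min_freq` are
    # filtered out before scoring.
    #
    #   docs = [["not", "good"], ["not", "good"], ["not", "bad"]]
    #   analyzer.bigram_collocation_feats(docs, top_n=5, min_freq=2)
    #   # -> [("not", "good")] (given PMI scoring over these toy documents)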

    def classify(self, instance):
        """

        Classify a single instance applying the features that have already been

        stored in the SentimentAnalyzer.



        :param instance: a list (or iterable) of tokens.

        :return: the classification result given by applying the classifier.

        """
        instance_feats = self.apply_features([instance], labeled=False)
        return self.classifier.classify(instance_feats[0])
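
    # A hedged sketch (tokens are illustrative; a classifier must have been
    # trained or assigned first):
    #
    #   analyzer.classify(["this", "movie", "was", "good"])
    #   # -> e.g. "subj"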

    def add_feat_extractor(self, function, **kwargs):
        """

        Add a new function to extract features from a document. This function will

        be used in extract_features().

        Important: in this step our kwargs are only representing additional parameters,

        and NOT the document we have to parse. The document will always be the first

        parameter in the parameter list, and it will be added in the extract_features()

        function.



        :param function: the extractor function to add to the list of feature extractors.

        :param kwargs: additional parameters required by the `function` function.

        """
        self.feat_extractors[function].append(kwargs)
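
    # A usage sketch: `extract_unigram_feats` from `nltk.sentiment.util` takes
    # the document as its first argument and a `unigrams` keyword, so only the
    # keyword is registered here; `extract_features()` supplies the document
    # later.
    #
    #   from nltk.sentiment.util import extract_unigram_feats
    #   analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)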

    def extract_features(self, document):
        """

        Apply extractor functions (and their parameters) to the present document.

        We pass `document` as the first parameter of the extractor functions.

        If we want to use the same extractor function multiple times, we have to

        add it to the extractors with `add_feat_extractor` using multiple sets of

        parameters (one for each call of the extractor function).



        :param document: the document that will be passed as argument to the

            feature extractor functions.

        :return: A dictionary of populated features extracted from the document.

        :rtype: dict

        """
        all_features = {}
        for extractor, param_sets in self.feat_extractors.items():
            for param_set in param_sets:
                feats = extractor(document, **param_set)
                # Merge the features from each (extractor, param_set) call, so
                # that every registered parameter set contributes features.
                all_features.update(feats)
        return all_features

    def train(self, trainer, training_set, save_classifier=None, **kwargs):
        """

        Train classifier on the training set, optionally saving the output in the

        file specified by `save_classifier`.

        Additional arguments depend on the specific trainer used. For example,

        a MaxentClassifier can use `max_iter` parameter to specify the number

        of iterations, while a NaiveBayesClassifier cannot.



        :param trainer: `train` method of a classifier.

            E.g.: NaiveBayesClassifier.train

        :param training_set: the training set to be passed as argument to the

            classifier `train` method.

        :param save_classifier: the filename of the file where the classifier

            will be stored (optional).

        :param kwargs: additional parameters that will be passed as arguments to

            the classifier `train` function.

        :return: A classifier instance trained on the training set.

        :rtype:

        """
        print("Training classifier")
        self.classifier = trainer(training_set, **kwargs)
        if save_classifier:
            self.save_file(self.classifier, save_classifier)

        return self.classifier
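
    # A minimal sketch, assuming `training_set` was built with
    # `apply_features(...)` on labeled documents:
    #
    #   from nltk.classify import NaiveBayesClassifier
    #   classifier = analyzer.train(NaiveBayesClassifier.train, training_set)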

    def save_file(self, content, filename):
        """

        Store `content` in `filename`. Can be used to store a SentimentAnalyzer.

        """
        print("Saving", filename, file=sys.stderr)
        with open(filename, "wb") as storage_file:
            import pickle

            # The protocol=2 parameter is for python2 compatibility
            pickle.dump(content, storage_file, protocol=2)

    def evaluate(
        self,
        test_set,
        classifier=None,
        accuracy=True,
        f_measure=True,
        precision=True,
        recall=True,
        verbose=False,
    ):
        """

        Evaluate and print classifier performance on the test set.



        :param test_set: A list of (tokens, label) tuples to use as gold set.

        :param classifier: a classifier instance (previously trained).

        :param accuracy: if `True`, evaluate classifier accuracy.

        :param f_measure: if `True`, evaluate classifier f_measure.

        :param precision: if `True`, evaluate classifier precision.

        :param recall: if `True`, evaluate classifier recall.

        :return: evaluation results.

        :rtype: dict(str): float

        """
        if classifier is None:
            classifier = self.classifier
        print(f"Evaluating {type(classifier).__name__} results...")
        metrics_results = {}
        if accuracy:
            accuracy_score = eval_accuracy(classifier, test_set)
            metrics_results["Accuracy"] = accuracy_score

        gold_results = defaultdict(set)
        test_results = defaultdict(set)
        labels = set()
        for i, (feats, label) in enumerate(test_set):
            labels.add(label)
            gold_results[label].add(i)
            observed = classifier.classify(feats)
            test_results[observed].add(i)

        for label in labels:
            if precision:
                precision_score = eval_precision(
                    gold_results[label], test_results[label]
                )
                metrics_results[f"Precision [{label}]"] = precision_score
            if recall:
                recall_score = eval_recall(gold_results[label], test_results[label])
                metrics_results[f"Recall [{label}]"] = recall_score
            if f_measure:
                f_measure_score = eval_f_measure(
                    gold_results[label], test_results[label]
                )
                metrics_results[f"F-measure [{label}]"] = f_measure_score

        # Print evaluation results (in alphabetical order)
        if verbose:
            for result in sorted(metrics_results):
                print(f"{result}: {metrics_results[result]}")

        return metrics_results
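

if __name__ == "__main__":
    # A minimal end-to-end sketch, not part of the public API. It assumes the
    # `subjectivity` corpus is available locally (e.g. after
    # nltk.download("subjectivity")); the instance count and the 80/20 split
    # are illustrative choices.
    from nltk.classify import NaiveBayesClassifier
    from nltk.corpus import subjectivity
    from nltk.sentiment.util import extract_unigram_feats

    n_instances = 100
    subj_docs = [
        (sent, "subj")
        for sent in subjectivity.sents(categories="subj")[:n_instances]
    ]
    obj_docs = [
        (sent, "obj")
        for sent in subjectivity.sents(categories="obj")[:n_instances]
    ]
    train_docs = subj_docs[:80] + obj_docs[:80]
    test_docs = subj_docs[80:] + obj_docs[80:]

    analyzer = SentimentAnalyzer()
    # Use as features the unigrams that occur more than `min_freq` times.
    unigram_feats = analyzer.unigram_word_feats(
        analyzer.all_words(train_docs), min_freq=4
    )
    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    training_set = analyzer.apply_features(train_docs)
    test_set = analyzer.apply_features(test_docs)

    analyzer.train(NaiveBayesClassifier.train, training_set)
    analyzer.evaluate(test_set, verbose=True)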