File size: 6,568 Bytes
d6585f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import List

from ..pyclass import autoclass

# Wrappers around Lucene classes.
# Each name binds a Java class (loaded through ``autoclass``) so it can be
# instantiated or referenced from Python.
JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer')
JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer')
JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer')
JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer')
JDanishAnalyzer = autoclass('org.apache.lucene.analysis.da.DanishAnalyzer')
JDutchAnalyzer = autoclass('org.apache.lucene.analysis.nl.DutchAnalyzer')
JFinnishAnalyzer = autoclass('org.apache.lucene.analysis.fi.FinnishAnalyzer')
JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer')
JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer')
JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer')
JHungarianAnalyzer = autoclass('org.apache.lucene.analysis.hu.HungarianAnalyzer')
JIndonesianAnalyzer = autoclass('org.apache.lucene.analysis.id.IndonesianAnalyzer')
JItalianAnalyzer = autoclass('org.apache.lucene.analysis.it.ItalianAnalyzer')
JJapaneseAnalyzer = autoclass('org.apache.lucene.analysis.ja.JapaneseAnalyzer')
JNorwegianAnalyzer = autoclass('org.apache.lucene.analysis.no.NorwegianAnalyzer')
JPortugueseAnalyzer = autoclass('org.apache.lucene.analysis.pt.PortugueseAnalyzer')
JRussianAnalyzer = autoclass('org.apache.lucene.analysis.ru.RussianAnalyzer')
JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer')
JSwedishAnalyzer = autoclass('org.apache.lucene.analysis.sv.SwedishAnalyzer')
JTeluguAnalyzer = autoclass('org.apache.lucene.analysis.te.TeluguAnalyzer')
JThaiAnalyzer = autoclass('org.apache.lucene.analysis.th.ThaiAnalyzer')
JTurkishAnalyzer = autoclass('org.apache.lucene.analysis.tr.TurkishAnalyzer')
JWhiteSpaceAnalyzer = autoclass('org.apache.lucene.analysis.core.WhitespaceAnalyzer')
JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet')

# Wrappers around Anserini classes.
# ``JDefaultEnglishAnalyzer`` lives in the ``io.anserini`` package, so it is
# bound once here rather than also among the Lucene wrappers above.
JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils')
JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer')
JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer')
JHuggingFaceTokenizerAnalyzer = autoclass('io.anserini.analysis.HuggingFaceTokenizerAnalyzer')


def get_lucene_analyzer(language: str='en', stemming: bool=True, stemmer: str='porter', stopwords: bool=True, huggingFaceTokenizer: str=None) -> JAnalyzer:
    """Create a Lucene ``Analyzer`` with specific settings.

    Parameters
    ----------
    language : str
        Name of analyzer (case-insensitive). Either a language code such as
        ``'en'``, ``'fr'`` or ``'sv'``, or one of the special names
        ``'tweet'`` and ``'hgf_tokenizer'``. ``'zh'`` and ``'ko'`` both map to
        the CJK analyzer.
    stemming : bool
        Set to stem. Only consulted when ``language`` is ``'en'``.
    stemmer : str
        Stemmer to use. Only consulted when ``language`` is ``'en'`` and
        ``stemming`` is set.
    stopwords : bool
        Set to filter stopwords. Only consulted when ``language`` is ``'en'``.
    huggingFaceTokenizer: str
        a huggingface model id or path to a tokenizer.json file. Required when
        ``language`` is ``'hgf_tokenizer'``; ignored otherwise.

    Returns
    -------
    JAnalyzer
        Java ``Analyzer`` with specified settings.

    Raises
    ------
    ValueError
        If ``language`` is not a supported analyzer name, or if
        ``language`` is ``'hgf_tokenizer'`` and no ``huggingFaceTokenizer``
        is given.
    """
    # Normalize once instead of on every comparison.
    name = language.lower()

    # Analyzers that are built with their no-argument constructor.
    default_constructed = {
        'ar': JArabicAnalyzer,
        'bn': JBengaliAnalyzer,
        'zh': JCJKAnalyzer,
        'ko': JCJKAnalyzer,
        'da': JDanishAnalyzer,
        'nl': JDutchAnalyzer,
        'fi': JFinnishAnalyzer,
        'fr': JFrenchAnalyzer,
        'de': JGermanAnalyzer,
        'hi': JHindiAnalyzer,
        'hu': JHungarianAnalyzer,
        'id': JIndonesianAnalyzer,
        'it': JItalianAnalyzer,
        'ja': JJapaneseAnalyzer,
        'no': JNorwegianAnalyzer,
        'pt': JPortugueseAnalyzer,
        'ru': JRussianAnalyzer,
        'es': JSpanishAnalyzer,
        # The Swedish analyzer is wrapped at module level; expose it here too.
        'sv': JSwedishAnalyzer,
        'te': JTeluguAnalyzer,
        'th': JThaiAnalyzer,
        'tr': JTurkishAnalyzer,
        'tweet': JTweetAnalyzer,
    }
    analyzer_class = default_constructed.get(name)
    if analyzer_class is not None:
        return analyzer_class()

    if name == 'hgf_tokenizer':
        # Fail early with a clear Python error rather than handing ``None``
        # to the Java constructor.
        if huggingFaceTokenizer is None:
            raise ValueError('huggingFaceTokenizer must be specified when language is "hgf_tokenizer".')
        return JHuggingFaceTokenizerAnalyzer(huggingFaceTokenizer)

    if name == 'en':
        # An empty stopword set disables stopword filtering.
        if stemming:
            if stopwords:
                return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer)
            return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer, JCharArraySet.EMPTY_SET)
        if stopwords:
            return JDefaultEnglishAnalyzer.newNonStemmingInstance()
        return JDefaultEnglishAnalyzer.newNonStemmingInstance(JCharArraySet.EMPTY_SET)

    raise ValueError('Invalid configuration.')


class Analyzer:
    """Thin Python facade over a Lucene ``Analyzer`` for tokenizing text.

    Parameters
    ----------
    analyzer : JAnalyzer
        Lucene ``Analyzer`` instance used for all subsequent analysis calls.

    Raises
    ------
    TypeError
        If ``analyzer`` is not an instance of ``JAnalyzer``.
    """

    def __init__(self, analyzer):
        if isinstance(analyzer, JAnalyzer):
            self.analyzer = analyzer
        else:
            raise TypeError('Invalid JAnalyzer!')

    def analyze(self, text: str) -> List[str]:
        """Run the wrapped analyzer over ``text`` and collect its tokens.

        Parameters
        ----------
        text : str
            Text to analyze.

        Returns
        -------
        List[str]
            Tokens produced by the analyzer, in the order they are returned.
        """
        java_tokens = JAnalyzerUtils.analyze(self.analyzer, text)
        # Copy the Java-side result into a plain Python list.
        return list(java_tokens.toArray())