File size: 1,466 Bytes
8744085
e48d543
8744085
 
 
a97ba6f
 
 
dbb343d
a97ba6f
 
8744085
 
 
 
 
 
 
 
b748dad
 
 
 
 
 
 
 
b3ecaa7
b748dad
dbb343d
b748dad
 
8744085
 
 
 
 
 
 
 
 
 
bd07b6e
8744085
 
 
 
 
 
b3ecaa7
8744085
 
 
e952967
b3ecaa7
 
c718eb8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from enum import Enum

import pandas as pd


class ColumnNames(Enum):
    """Canonical names for the label, text and processed-text columns.

    Members are plain ``Enum`` instances, so they do not compare equal to
    the bare strings; read the literal through ``.value``
    (e.g. ``ColumnNames.TEXT.value``).
    """

    # NOTE(review): presumably used as pandas DataFrame column names --
    # confirm against the call sites.
    LABEL = "label"
    TEXT = "text"
    PROCESSED_TEXT = "processed_text"


class ModelConfigs(Enum):
    """Numeric settings for the model stage; read each one via ``.value``.

    NOTE(review): the exact meaning of every setting is defined by the code
    that consumes it, which is not visible here -- the per-member notes
    below are assumptions to confirm.
    """

    # Presumably the number of fitting iterations/runs -- TODO confirm.
    NUM_ITERS = 500

    # Presumably a cut-off applied to selection scores -- TODO confirm.
    SELECTION_THRESHOLD = 0.0

    # Candidate penalty strengths, listed from largest to smallest.
    # The value is a plain (mutable) list shared by every reader of this
    # member, so callers should copy it before modifying.
    PENALTIES = [
        10,
        5,
        2,
        1,
        0.5,
        0.1,
        0.05,
        0.01,
        0.005,
        0.001,
        0.0001,
        0.00001,
    ]

    # Upper and lower bounds; presumably on a sample/selection size -- TODO confirm.
    MAX_SELECTION = 100_000
    MIN_SELECTION = 10_000


class InputTransformConfigs(Enum):
    """Settings for turning input text into features; read via ``.value``.

    NOTE(review): the member names resemble keyword arguments of a TF-IDF
    style vectorizer (n-gram range, min/max document frequency, sublinear
    scaling) -- presumably they are forwarded to one; confirm at the call
    site.
    """

    # (low, high) pair stored as a tuple.
    NGRAM_RANGE = (1, 3)

    # Floats, so presumably document-frequency *proportions* rather than
    # absolute counts -- TODO confirm.
    MIN_DF = 0.001
    MAX_DF = 0.75

    SUBLINEAR = True


class PreprocessingConfigs(Enum):
    """Default selections for the preprocessing stage; read via ``.value``.

    NOTE(review): the integers are opaque from this module -- presumably
    they index into tables of preprocessing / lemmatization options defined
    elsewhere. Confirm against that code before changing any number.
    """

    # Defaults for the "pre" step list; order is preserved as written.
    DEFAULT_PRE = [1, 14, 2, 3, 4, 5, 23, 22, 21, 24]

    # Single default for the lemmatization choice.
    DEFAULT_LEMMA = 1

    # Defaults for the "post" step list; order is preserved as written.
    DEFAULT_POST = [0, 17, 15, 19, 23, 22, 21, 24]


class Languages(Enum):
    """Supported languages, each mapped to a model identifier string.

    The member name is the human-readable language and ``.value`` is the
    identifier (e.g. ``Languages.English.value == "en_core_web_sm"``).

    NOTE(review): the values look like spaCy small-pipeline package names --
    presumably they are handed to a spaCy loader; confirm at the call site.
    """

    English = "en_core_web_sm"
    Italian = "it_core_news_sm"
    German = "de_core_news_sm"
    Spanish = "es_core_news_sm"
    Greek = "el_core_news_sm"
    Dutch = "nl_core_news_sm"
    Portuguese = "pt_core_news_sm"
    French = "fr_core_news_sm"
    Danish = "da_core_news_sm"
    # Japanese is currently disabled (reason not recorded in this file):
    # Japanese = "ja_core_news_sm"
    Lithuanian = "lt_core_news_sm"
    # "Norvegian" is a historical misspelling. It stays the canonical member
    # so existing lookups by name and any stored/displayed names keep working.
    Norvegian = "nb_core_news_sm"
    # Correctly spelled alias. Because it repeats the value above, Enum treats
    # it as an alias of ``Norvegian``: it does not appear when iterating the
    # class and ``Languages.Norwegian.name`` is still "Norvegian".
    Norwegian = "nb_core_news_sm"
    Polish = "pl_core_news_sm"
    Romanian = "ro_core_news_sm"
    Russian = "ru_core_news_sm"
    MultiLanguage = "xx_ent_wiki_sm"
    Chinese = "zh_core_web_sm"


class SupportedFiles(Enum):
    """File types that can be loaded, keyed by a short format name.

    Each member's ``.value`` is a 1-tuple holding a reader callable, so a
    source is loaded with ``SupportedFiles[name].value[0](source)`` and the
    result is whatever the underlying pandas reader returns.

    The tuple wrapper is deliberate: a bare function assigned in an ``Enum``
    body is treated as a method, not as a member. Wrapping it in a tuple
    makes it an ordinary member value.
    """

    # Excel, tab-separated and comma-separated inputs are all read with every
    # column forced to ``str``.
    xlsx = (lambda x: pd.read_excel(x, dtype=str),)
    tsv = (lambda x: pd.read_csv(x, dtype=str, sep="\t"),)
    csv = (lambda x: pd.read_csv(x, dtype=str, sep=","),)

    # NOTE(review): unlike the readers above, no ``dtype=str`` is applied
    # here, so columns keep the dtypes stored in the parquet file.
    parquet = (lambda x: pd.read_parquet(x),)