wordify / src /configs.py
Pietro Lesci
add support for chinese
b3ecaa7
from enum import Enum
import pandas as pd
class ColumnNames(Enum):
    """String identifiers for the label, text and processed-text columns.

    Presumably these are the DataFrame column names used by the rest of
    the project -- confirm against callers.
    """

    LABEL = "label"
    TEXT = "text"
    PROCESSED_TEXT = "processed_text"
class ModelConfigs(Enum):
    """Numeric settings grouped under the model namespace.

    How each value is consumed is not visible in this module; read the
    callers before changing any of them.
    """

    NUM_ITERS = 500
    SELECTION_THRESHOLD = 0.0
    # Listed from largest to smallest. The list object is shared by everyone
    # who reads ``ModelConfigs.PENALTIES.value``, so never mutate it in place.
    PENALTIES = [
        10,
        5,
        2,
        1,
        0.5,
        0.1,
        0.05,
        0.01,
        0.005,
        0.001,
        0.0001,
        0.00001,
    ]
    MAX_SELECTION = 100_000
    MIN_SELECTION = 10_000
class InputTransformConfigs(Enum):
    """Settings grouped under the input-transformation namespace.

    NOTE(review): the names resemble the parameters of a TF-IDF style
    vectorizer (n-gram range, min/max document frequency, sublinear
    scaling) -- confirm against the code that reads them.
    """

    NGRAM_RANGE = (1, 3)
    MIN_DF = 0.001
    MAX_DF = 0.75
    SUBLINEAR = True
class PreprocessingConfigs(Enum):
    """Default selections for the preprocessing stages.

    NOTE(review): the integers are presumably indices into option lists
    defined elsewhere in the project -- their meaning cannot be determined
    from this module; confirm before editing.
    """

    # The list objects are shared through ``.value``; do not mutate in place.
    DEFAULT_PRE = [1, 14, 2, 3, 4, 5, 23, 22, 21, 24]
    DEFAULT_LEMMA = 1
    DEFAULT_POST = [0, 17, 15, 19, 23, 22, 21, 24]
class Languages(Enum):
    """Map a language name to a model identifier string.

    The values look like spaCy pipeline package names (for example
    ``en_core_web_sm``) -- confirm against the loader that consumes them.
    Declaration order is the order in which the members are iterated.
    """

    English = "en_core_web_sm"
    Italian = "it_core_news_sm"
    German = "de_core_news_sm"
    Spanish = "es_core_news_sm"
    Greek = "el_core_news_sm"
    Dutch = "nl_core_news_sm"
    Portuguese = "pt_core_news_sm"
    French = "fr_core_news_sm"
    Danish = "da_core_news_sm"
    # Currently disabled entry, kept for reference:
    # Japanese = "ja_core_news_sm"
    Lithuanian = "lt_core_news_sm"
    # NOTE(review): "Norvegian" is a misspelling of "Norwegian". The member
    # name is part of the public interface, so it is left unchanged here.
    Norvegian = "nb_core_news_sm"
    Polish = "pl_core_news_sm"
    Romanian = "ro_core_news_sm"
    Russian = "ru_core_news_sm"
    MultiLanguage = "xx_ent_wiki_sm"
    Chinese = "zh_core_web_sm"
class SupportedFiles(Enum):
    """Map a file extension to the pandas reader used to load it.

    Each member's value is a one-element tuple whose only item is a callable
    taking the file (path or buffer) and returning a DataFrame; call it as
    ``SupportedFiles.csv.value[0](file)``.

    The tuple wrapping is deliberate: a bare function assigned in an Enum
    body is treated as a method rather than turned into a member.
    """

    # The text-based formats are read with every column as ``str``.
    xlsx = (
        lambda x: pd.read_excel(x, dtype=str),
    )
    tsv = (
        lambda x: pd.read_csv(x, dtype=str, sep="\t"),
    )
    csv = (
        lambda x: pd.read_csv(x, dtype=str, sep=","),
    )
    # Parquet is read with no dtype override.
    parquet = (
        lambda x: pd.read_parquet(x),
    )