# Model description

- Morphosyntactic analyzer: Stanza
- Tagset: NKJP
- Embedding vectors: Fasttext (wiki)
- Dataset: NLPrePL-NKJP-fair-by-name (https://huggingface.co/datasets/ipipan/nlprepl)

# How to use

## Clone

```
git clone git@hf.co:ipipan/nlpre_stanza_nkjp_fasttext_nkjp-by-name
```

## Load model

```
import os

import stanza

lang = 'pl'
model_name = 'nlpre_stanza_nkjp_fasttext_nkjp-by-name'
prefix = 'nkjpbyname_nkjp'

config = \
    {
        # Comma-separated list of processors to use
        'processors': 'tokenize,mwt,pos,lemma',
        # Language code for the language to build the Pipeline in
        'lang': lang,
        # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
        # You only need model paths if you have a specific model outside of stanza_resources
        'tokenize_model_path': os.path.join(model_name, f'{lang}_{prefix}_tokenizer.pt'),
        'mwt_model_path': os.path.join(model_name, f'{lang}_{prefix}_mwt_expander.pt'),
        'pos_model_path': os.path.join(model_name, f'{lang}_{prefix}_tagger.pt'),
        'pos_pretrain_path': os.path.join(model_name, f'{lang}_{prefix}.pretrain.pt'),
        'lemma_model_path': os.path.join(model_name, f'{lang}_{prefix}_lemmatizer.pt'),
        # Use pretokenized text as input and disable tokenization
        'tokenize_pretokenized': True
    }

model = stanza.Pipeline(**config)
```