Instructions to use E-katrin/train100_encoder_freezed_5_10e-4 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use E-katrin/train100_encoder_freezed_5_10e-4 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="E-katrin/train100_encoder_freezed_5_10e-4", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("E-katrin/train100_encoder_freezed_5_10e-4", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
from collections.abc import Callable, Iterable

from transformers import Pipeline

from src.lemmatize_helper import reconstruct_lemma
class ConlluTokenClassificationPipeline(Pipeline):
    """Pipeline that parses raw text into CoNLL-U-style token annotations.

    The input text is split into sentences and words by the user-supplied
    ``sentenizer`` and ``tokenizer`` callables, the words are fed to the
    model, and the predicted ids are decoded through
    ``model.config.vocabulary`` into lemmas, POS tags, features, dependency
    arcs, misc and semantic tags (whichever the model returns).

    The result is either a list of per-sentence dicts (``output_format='list'``,
    the default) or a single CoNLL-U formatted string (``output_format='str'``).
    """

    # Accepted values of the `output_format` pipeline parameter.
    _OUTPUT_FORMATS = ('list', 'str')

    # Model output keys holding one id per token, mapped to the matching
    # keyword argument of `_decode_sentence`.
    _TOKEN_TAG_KEYS = {
        "lemma_rules": "lemma_rule_ids",
        "joint_feats": "joint_feats_ids",
        "miscs": "misc_ids",
        "deepslots": "deepslot_ids",
        "semclasses": "semclass_ids",
    }

    # Model output keys holding dependency arcs; `_decode_sentence` takes
    # them under the same names.
    _ARC_KEYS = ("deps_ud", "deps_eud")

    def __init__(
        self,
        model,
        tokenizer: Callable[[str], Iterable[str]] | None = None,
        sentenizer: Callable[[str], Iterable[str]] | None = None,
        **kwargs
    ):
        """Create the pipeline.

        Args:
            model: Model to run; forwarded to ``Pipeline.__init__``.
            tokenizer: Callable splitting one sentence into words.
            sentenizer: Callable splitting a text into sentences.
            **kwargs: Forwarded to ``Pipeline.__init__``.
        """
        super().__init__(model=model, **kwargs)
        # Assigned after the base initializer so these plain callables are the
        # final values of the attributes.
        self.tokenizer = tokenizer
        self.sentenizer = sentenizer

    def _sanitize_parameters(self, output_format: str | None = None, **kwargs):
        """Validate pipeline parameters and route them to the three stages.

        ``output_format`` is only forwarded when it was actually supplied;
        otherwise a value given at construction time would be overridden by
        the default on every call.

        Returns:
            A ``(preprocess, forward, postprocess)`` tuple of keyword dicts.

        Raises:
            ValueError: If ``output_format`` is neither ``'list'`` nor ``'str'``.
        """
        postprocess_params = {}
        if output_format is not None:
            if output_format not in self._OUTPUT_FORMATS:
                raise ValueError(
                    f"output_format must be 'str' or 'list', not {output_format}"
                )
            postprocess_params['output_format'] = output_format
        return {}, {}, postprocess_params

    def preprocess(self, inputs: str) -> dict:
        """Split the text into sentences and each sentence into words.

        Raises:
            ValueError: If ``inputs`` is not a string.
        """
        if not isinstance(inputs, str):
            raise ValueError("pipeline input must be string (text)")

        sentences = list(self.sentenizer(inputs))
        words = [list(self.tokenizer(sentence)) for sentence in sentences]
        # Stash the raw sentences for post-processing: they cannot travel
        # through `_forward`, which passes every key of the returned dict
        # to the model.
        self._texts = sentences
        return {"words": words}

    def _forward(self, model_inputs: dict) -> dict:
        """Run the model on the preprocessed inputs in inference mode."""
        return self.model(**model_inputs, inference_mode=True)

    def postprocess(self, model_outputs: dict, output_format: str = 'list') -> list[dict] | str:
        """Decode model outputs into sentence dicts or a CoNLL-U string."""
        sentences = self._decode_model_output(model_outputs)
        # Format sentences into CoNLL-U string if requested.
        if output_format == 'str':
            return self._format_as_conllu(sentences)
        return sentences

    def _decode_model_output(self, model_outputs: dict) -> list[dict]:
        """Decode the batched model outputs sentence by sentence.

        Only the tags present in ``model_outputs`` are decoded.
        """
        sentences_decoded = []
        for i, words in enumerate(model_outputs["words"]):
            # Model outputs are padded tensors, so only leave first `n_words` labels.
            n_words = len(words)
            optional_tags = {
                kwarg: model_outputs[key][i, :n_words].tolist()
                for key, kwarg in self._TOKEN_TAG_KEYS.items()
                if key in model_outputs
            }
            for key in self._ARC_KEYS:
                if key in model_outputs:
                    arcs = model_outputs[key]
                    # Select arcs whose batch index (first column) is `i` and
                    # drop that column, leaving [n_selected_arcs, 3].
                    optional_tags[key] = arcs[arcs[:, 0] == i][:, 1:].tolist()

            sentences_decoded.append(
                self._decode_sentence(
                    text=self._texts[i],
                    words=words,
                    **optional_tags,
                )
            )
        return sentences_decoded

    def _decode_sentence(
        self,
        text: str,
        words: list[str],
        lemma_rule_ids: list[int] = None,
        joint_feats_ids: list[int] = None,
        deps_ud: list[list[int]] = None,
        deps_eud: list[list[int]] = None,
        misc_ids: list[int] = None,
        deepslot_ids: list[int] = None,
        semclass_ids: list[int] = None
    ) -> dict:
        """Decode the predicted ids of one sentence into string annotations.

        Every optional argument that is ``None`` or empty is skipped, and the
        corresponding key is absent from the result.

        Returns:
            A dict with ``text``, ``words`` and ``ids`` plus, when available,
            ``lemmas``, ``upos``, ``xpos``, ``feats``, ``deps_ud``,
            ``deps_eud``, ``miscs``, ``deepslots`` and ``semclasses``.
        """
        # Enumerate words in the sentence, starting from 1.
        ids = self._enumerate_words(words)

        result = {
            "text": text,
            "words": words,
            "ids": ids
        }

        # Decode lemmas.
        if lemma_rule_ids:
            lemma_rules = self.model.config.vocabulary["lemma_rule"]
            result["lemmas"] = [
                reconstruct_lemma(word, lemma_rules[lemma_rule_id])
                for word, lemma_rule_id in zip(words, lemma_rule_ids, strict=True)
            ]

        # Decode POS and features.
        if joint_feats_ids:
            joint_feats = self.model.config.vocabulary["joint_feats"]
            # Each vocabulary entry packs the three tags as 'upos#xpos#feats'.
            upos, xpos, feats = zip(
                *[
                    joint_feats[joint_feats_id].split('#')
                    for joint_feats_id in joint_feats_ids
                ],
                strict=True
            )
            result["upos"] = list(upos)
            result["xpos"] = list(xpos)
            result["feats"] = list(feats)

        # Decode syntax.
        def renumerate_and_decode_arcs(arcs, id2rel):
            # `ids` maps the internal word index to the CoNLL-U token id.
            # An arc pointing at itself is emitted with head '0'.
            return [
                (
                    '0' if arc_from == arc_to else ids[arc_from],
                    ids[arc_to],
                    id2rel[deprel_id]
                )
                for arc_from, arc_to, deprel_id in arcs
            ]

        if deps_ud:
            result["deps_ud"] = renumerate_and_decode_arcs(
                deps_ud,
                self.model.config.vocabulary["ud_deprel"]
            )
        if deps_eud:
            result["deps_eud"] = renumerate_and_decode_arcs(
                deps_eud,
                self.model.config.vocabulary["eud_deprel"]
            )

        # Decode misc.
        if misc_ids:
            miscs = self.model.config.vocabulary["misc"]
            result["miscs"] = [miscs[misc_id] for misc_id in misc_ids]

        # Decode semantics.
        if deepslot_ids:
            deepslots = self.model.config.vocabulary["deepslot"]
            result["deepslots"] = [
                deepslots[deepslot_id] for deepslot_id in deepslot_ids
            ]
        if semclass_ids:
            semclasses = self.model.config.vocabulary["semclass"]
            result["semclasses"] = [
                semclasses[semclass_id] for semclass_id in semclass_ids
            ]

        return result

    @staticmethod
    def _enumerate_words(words: list[str]) -> list[str]:
        """Assign token ids to words.

        Regular words are numbered 1, 2, 3, ...; each "#NULL" word gets the
        id "<k>.<m>", where k is the number of the last regular word seen
        (0 if none) and m counts the "#NULL" words since then.

        Example: ["a", "#NULL", "#NULL", "b"] -> ["1", "1.1", "1.2", "2"].
        """
        ids = []
        current_id = 0
        current_null_count = 0
        for word in words:
            if word == "#NULL":
                current_null_count += 1
                ids.append(f"{current_id}.{current_null_count}")
            else:
                current_id += 1
                current_null_count = 0
                ids.append(f"{current_id}")
        return ids

    @staticmethod
    def _format_as_conllu(sentences: list[dict]) -> str:
        """
        Format a list of sentence dicts into a CoNLL-U formatted string.

        Each sentence becomes a `# text = ...` line followed by one
        tab-separated line per token with the columns: id, word, lemma, upos,
        xpos, feats, head, deprel, deps, misc, deepslot, semclass. Columns
        with no value are written as `_`, since empty fields are not allowed.
        Sentences are separated by a blank line.
        """
        empty = '_'
        formatted = []
        for sentence in sentences:
            ids = sentence['ids']
            n_tokens = len(ids)
            # The first line is a text metadata.
            lines = [f"# text = {sentence['text']}"]

            id2idx = {token_id: idx for idx, token_id in enumerate(ids)}

            # Basic syntax.
            heads = [empty] * n_tokens
            deprels = [empty] * n_tokens
            for arc_from, arc_to, deprel in sentence.get('deps_ud', ()):
                token_idx = id2idx[arc_to]
                heads[token_idx] = arc_from
                deprels[token_idx] = deprel

            # Enhanced syntax: a token may have several heads.
            deps_dicts = [{} for _ in range(n_tokens)]
            for arc_from, arc_to, deprel in sentence.get('deps_eud', ()):
                deps_dicts[id2idx[arc_to]][arc_from] = deprel

            def column(key, idx):
                # Value of an optional per-token column, or the placeholder.
                values = sentence.get(key)
                if values is None:
                    return empty
                return values[idx] or empty

            for idx, token_id in enumerate(ids):
                deps = '|'.join(
                    f"{head}:{rel}" for head, rel in deps_dicts[idx].items()
                ) or empty
                # CoNLL-U columns
                line = '\t'.join([
                    token_id,
                    sentence['words'][idx],
                    column('lemmas', idx),
                    column('upos', idx),
                    column('xpos', idx),
                    column('feats', idx),
                    heads[idx],
                    deprels[idx],
                    deps,
                    column('miscs', idx),
                    column('deepslots', idx),
                    column('semclasses', idx),
                ])
                lines.append(line)
            formatted.append('\n'.join(lines))
        return '\n\n'.join(formatted)