---
language: ru
tags:
- russian
- text-to-text
- PyTorch
- Transformers
license: apache-2.0
widget:
- text: Водка "Русская валюта" премиум люкс 38% 0,25л, Россия
pipeline_tag: text2text-generation
---

This is a named entity recognizer for goods and brands extraction from receipts of fiscal data operators in Russian.

It was developed for the special multi-staged competition devoted to receipt structurization. This competition was organized by [Open Data Science community](https://ods.ai) and [Alpha Bank](https://alfabank.ru), and it consisted of [the first](https://ods.ai/competitions/nlp-receipts), [the second](https://ods.ai/competitions/alfabank-nlp-receipts-2) and [the final](https://ods.ai/competitions/alfabank-nlp-receipts-final) stages. But this model can be used for any receipt parsing and structurization in Russian.

The repository with code for fine-tuning and inference is available on [gitflic.ru](https://gitflic.ru/project/bond005/ods-ner-2023).

Usage example:

```python
from typing import Tuple

import torch
from transformers import T5ForConditionalGeneration, GPT2Tokenizer


MODEL_NAME = 'bond005/FRED-T5-large-ods-ner-2023'
# Prefix prepended to every input and the end-of-sequence marker trimmed from
# the output.
# NOTE(review): both values were empty in this copy of the card (they look like
# angle-bracket tags lost in extraction); '<LM>' and '</s>' are the FRED-T5
# prefix and EOS token -- confirm against the repository linked above.
START_TAG = '<LM>'
END_TAG = '</s>'


def initialize_recognizer(model_path: str) -> Tuple[GPT2Tokenizer, T5ForConditionalGeneration]:
    """Load the tokenizer and the model, and put the model on the GPU in eval mode.

    :param model_path: name of the model on the hub or path to a local directory.
    :return: a (tokenizer, model) pair.
    :raises ValueError: if CUDA is not available.
    """
    # Fail fast: there is no point in loading the weights without a GPU.
    if not torch.cuda.is_available():
        raise ValueError('CUDA is not available!')
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    model = model.cuda()
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    return tokenizer, model


def recognize(text: str, tokenizer: GPT2Tokenizer, model: T5ForConditionalGeneration) -> Tuple[str, str]:
    """Extract the goods name and the brand name from a single receipt line.

    :param text: receipt line; START_TAG is prepended unless already present.
    :param tokenizer: tokenizer returned by `initialize_recognizer`.
    :param model: model returned by `initialize_recognizer`.
    :return: a (goods, brands) pair; a missing part is an empty string.
    """
    if not text.startswith(START_TAG):
        text = START_TAG + text
    x = tokenizer(text, return_tensors='pt', padding=True).to(model.device)
    out = model.generate(**x)
    predictions = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    # The `END_TAG and` guard matters: every string ends with '', so an empty
    # tag would make this loop spin forever.
    while END_TAG and predictions.endswith(END_TAG):
        predictions = predictions[:-len(END_TAG)].strip()
    # The generated text is split on ';': the first part is the goods, the
    # second (if any) is the brands. `split` never returns an empty list.
    prediction_pair = predictions.split(';')
    goods = prediction_pair[0].strip()
    brands = prediction_pair[1].strip() if len(prediction_pair) > 1 else ''
    return goods, brands


recognizer = initialize_recognizer(MODEL_NAME)
goods_and_brands = recognize(
    text='Водка "Русская валюта" премиум люкс 38% 0,25л, Россия',
    tokenizer=recognizer[0],
    model=recognizer[1]
)
print(f'GOODS: {goods_and_brands[0]}')  # водка
print(f'BRANDS: {goods_and_brands[1]}')  # русская валюта
```