bond005/FRED-T5-large-ods-ner-2023

This is a named entity recognizer for extracting goods and brands from Russian-language receipts of fiscal data operators.

It was developed for a special multi-stage competition devoted to receipt structurization. The competition was organized by the Open Data Science community and Alpha Bank, and it consisted of a first, a second, and a final stage. However, this model can be used for any receipt parsing and structurization task in Russian. The repository with the code for fine-tuning and inference is available on gitflic.ru.

Usage example:

from typing import Tuple
import torch
from transformers import T5ForConditionalGeneration, GPT2Tokenizer


# Model identifier handed to initialize_recognizer() in the example below.
MODEL_NAME = 'bond005/FRED-T5-large-ods-ner-2023'
# Prefix that recognize() prepends to the input text when it is not already there.
START_TAG = '<LM>'
# Marker that recognize() strips from the end of the decoded model output.
END_TAG = '</s>'


def initialize_recognizer(model_path: str) -> Tuple[GPT2Tokenizer, T5ForConditionalGeneration]:
    """Load the tokenizer and the model, and move the model to the GPU.

    Args:
        model_path: Value passed unchanged to ``from_pretrained`` of both the
            model and the tokenizer (for example ``MODEL_NAME``).

    Returns:
        A ``(tokenizer, model)`` pair. The model has been moved to the CUDA
        device and switched to evaluation mode.

    Raises:
        ValueError: If CUDA is not available.
    """
    # Check the cheap precondition first, so that a machine without a GPU
    # fails immediately instead of loading the weights only to discard them.
    if not torch.cuda.is_available():
        raise ValueError('CUDA is not available!')
    model = T5ForConditionalGeneration.from_pretrained(model_path).cuda()
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    return tokenizer, model


def recognize(text: str, tokenizer: GPT2Tokenizer, model: T5ForConditionalGeneration) -> Tuple[str, str]:
    """Extract the goods and the brands mentioned in a receipt text.

    Args:
        text: Receipt text. ``START_TAG`` is prepended when the text does not
            already start with it.
        tokenizer: Tokenizer used to encode the input and decode the output.
        model: Model whose ``generate`` method produces the prediction; the
            encoded input is moved to ``model.device``.

    Returns:
        A ``(goods, brands)`` pair of stripped strings taken from the first
        and the second ``;``-separated parts of the decoded prediction.
        ``brands`` is an empty string when the prediction contains no ``;``.
        Any parts after the second one are ignored.
    """
    # Build the prompt once instead of duplicating the tokenizer call.
    prompt = text if text.startswith(START_TAG) else START_TAG + text
    x = tokenizer(prompt, return_tensors='pt', padding=True).to(model.device)
    out = model.generate(**x)
    predictions = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    # Drop any end-of-sequence markers that survived decoding.
    while predictions.endswith(END_TAG):
        predictions = predictions[:-len(END_TAG)].strip()
    # str.split always returns at least one element, so only the
    # "no brands part" case needs special handling.
    parts = predictions.split(';')
    goods = parts[0].strip()
    brands = parts[1].strip() if len(parts) > 1 else ''
    return goods, brands


# Load the tokenizer and the model once and reuse them for the call below.
recognizer = initialize_recognizer(MODEL_NAME)
tokenizer, model = recognizer

receipt_text = 'Водка "Русская валюта" премиум люкс 38% 0,25л, Россия'
goods_and_brands = recognize(text=receipt_text, tokenizer=tokenizer, model=model)

print(f'GOODS: {goods_and_brands[0]}')
# sample output for this input: водка

print(f'BRANDS: {goods_and_brands[1]}')
# sample output for this input: русская валюта