File size: 4,050 Bytes

2fddad0

from typing import Dict
import numpy as np
import torch
from transformers import Pipeline
from transformers.utils import ModelOutput
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import AutoModelForSeq2SeqLM
from huggingface_hub import Repository

SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"

from transformers import GenerationConfig

GENERATION_CONFIG = GenerationConfig(
    max_length=20,
    max_new_tokens=128,
    min_new_tokens=1,
    min_length=0,
    early_stopping=True,
    do_sample=True,
    num_beams=5,
    num_beam_groups=1,
    top_k=50,
    top_p=0.95,
    temperature=1.0,
    diversity_penalty=0.0,
    output_scores=True,
    return_dict_in_generate=True,
)


class CopticEnglishPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "from_bohairic" in kwargs and kwargs["from_bohairic"]:
            preprocess_kwargs["from_bohairic"] = True
        forward_kwargs = {}
        if "output_confidence" in kwargs and kwargs["output_confidence"]:
            forward_kwargs["output_confidence"] = True

        return preprocess_kwargs, forward_kwargs, {}

    def preprocess(self, text, from_bohairic=False):
        text = greekify(text.lower())

        if from_bohairic:
            text = f"{BOHAIRIC_TAG} {text}"
        else:
            text = f"{SAHIDIC_TAG} {text}"

        return self.tokenizer.encode(text, return_tensors="pt")

    def _forward(self, input_tensors, output_confidence=False) -> ModelOutput:
        outputs = self.model.generate(
            input_tensors[:, : self.tokenizer.model_max_length],
            generation_config=GENERATION_CONFIG,
        )

        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )

        if output_confidence:
            scores = outputs.scores
            confidences = [
                torch.softmax(score, dim=-1).max().item() for score in scores
            ]
            num_words = len(translated_text.split())
            # scale the predicition probability by the number of words in the sentence
            scaled_probability = np.exp(sum(np.log(confidences)) / num_words)
            return translated_text, scaled_probability

        return translated_text, None

    def postprocess(self, outputs):
        text, confidence = outputs
        if confidence is None:
            return {
                "translation": text,
            }
        return {
            "translation": text,
            "confidence": confidence,
        }


COPTIC_TO_GREEK = {
    "ⲁ": "α",
    "ⲃ": "β",
    "ⲅ": "γ",
    "ⲇ": "δ",
    "ⲉ": "ε",
    "ⲋ": "ϛ",
    "ⲍ": "ζ",
    "ⲏ": "η",
    "ⲑ": "θ",
    "ⲓ": "ι",
    "ⲕ": "κ",
    "ⲗ": "λ",
    "ⲙ": "μ",
    "ⲛ": "ν",
    "ⲝ": "ξ",
    "ⲟ": "ο",
    "ⲡ": "π",
    "ⲣ": "ρ",
    "ⲥ": "σ",
    "ⲧ": "τ",
    "ⲩ": "υ",
    "ⲫ": "φ",
    "ⲭ": "χ",
    "ⲯ": "ψ",
    "ⲱ": "ω",
    "ϣ": "s",
    "ϥ": "f",
    "ϧ": "k",
    "ϩ": "h",
    "ϫ": "j",
    "ϭ": "c",
    "ϯ": "t",
}


def greekify(coptic_text):
    chars = []
    for c in coptic_text:
        l_c = c.lower()
        chars.append(COPTIC_TO_GREEK.get(l_c, l_c))
    return "".join(chars)


if __name__ == "__main__":
    PIPELINE_REGISTRY.register_pipeline(
        "coptic-english-translation",
        pipeline_class=CopticEnglishPipeline,
        pt_model=AutoModelForSeq2SeqLM,
        default={"pt": "megalaa/mul-cop-en-norm-group-greekified"},
        type="text",
    )

    classifier = pipeline(
        "coptic-english-translation", model="megalaa/mul-cop-en-norm-group-greekified"
    )
    print(classifier("ⲛⲧⲟϥ ⲡⲉ ⲓⲏⲥⲟⲩⲥ ⲡⲉⲭⲣⲓⲥⲧⲟⲥ", from_bohairic=False, output_confidence=True))

    repo = Repository(
        "cop-eng-translation",
        clone_from="megalaa/mul-cop-en-norm-group-greekified",
    )
    classifier.save_pretrained("cop-eng-translation")