File size: 6,207 Bytes

from typing import Dict, List, Any
import sys, os, re
from tqdm import tqdm

import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from IndicTransTokenizer.utils import preprocess_batch, postprocess_batch
from IndicTransTokenizer.tokenizer import IndicTransTokenizer


class EndpointHandler():
    def __init__(self, direction = "en-indic", quantization = ""):
        self.model_name = "ai4bharat/indictrans2-en-indic-1B"

        self.utterance_pattern = re.compile(r"^\d+$")
        self.timestamp_pattern = re.compile(r"(\d+:\d+:\d+,\d+)\s*-->\s*(\d+:\d+:\d+,\d+)")

        self.BATCH_SIZE = 16
        self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

        self.model = None
        self.tokenizer = None

        if quantization == "4-bit":
            qconfig = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        elif quantization == "8-bit":
            qconfig = BitsAndBytesConfig(
                load_in_8bit=True,
                bnb_8bit_use_double_quant=True,
                bnb_8bit_compute_dtype=torch.bfloat16,
            )
        else:
            qconfig = None

        self.tokenizer = IndicTransTokenizer(direction=direction)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            quantization_config=qconfig
        )

        if qconfig==None:
            self.model = self.model.to(self.DEVICE)
            self.model.half()

        self.model.eval()


    def batch_translate(self, input_sentences, src_lang, tgt_lang):
        translations = []
        for i in range(0, len(input_sentences), self.BATCH_SIZE):
            batch = input_sentences[i : i + self.BATCH_SIZE]

            # Preprocess the batch and extract entity mappings
            batch, entity_map = preprocess_batch(
                batch, src_lang=src_lang, tgt_lang=tgt_lang
            )

            # Tokenize the batch and generate input encodings
            inputs = self.tokenizer(
                batch,
                src=True,
                truncation=True,
                padding="longest",
                return_tensors="pt",
                return_attention_mask=True,
            ).to(self.DEVICE)

            # Generate translations using the model
            with torch.no_grad():
                generated_tokens = self.model.generate(
                    **inputs,
                    use_cache=True,
                    min_length=0,
                    max_length=256,
                    num_beams=5,
                    num_return_sequences=1,
                )

            # Decode the generated tokens into text
            generated_tokens = self.tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(), src=False
            )

            # Postprocess the translations, including entity replacement
            translations += postprocess_batch(
                generated_tokens, lang=tgt_lang, placeholder_entity_map=entity_map
            )

            del inputs
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return translations


    def read_srt(self, srt_path):
        data = []
        with open(srt_path, 'r', encoding='utf-8') as fp:
            utterance_ind = ""
            start_end = ""
            text = ""
            for ind, line in enumerate(fp.readlines()):
                line = line.strip()
                if re.search(self.utterance_pattern, line) is not None:
                    utterance_ind = line
                elif re.search(self.timestamp_pattern, line) is not None:
                    start_end = line
                else:
                    text += line

                if utterance_ind!='' and start_end!='' and text!='':
                    data.append({'utterance_ind': utterance_ind, 'start_end': start_end, 'text': text})
                    utterance_ind = ''
                    start_end = ''
                    text = ''

        return data

    def test(self, inputs) -> List[Dict[str, Any]]:
        """
       data args:
            inputs (:obj: (transcript_path : 'str', src_lang : 'str', tgt_lang : 'str')
            kwargs
      Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """

        src_lang = inputs["src_lang"]
        tgt_lang = inputs["tgt_lang"]
        transcript_path = inputs["transcript_path"]

        output_translations = []
        if self.model is not None:
            transcriptions = self.read_srt(transcript_path)
            trans_sents = [entry['text'] for entry in transcriptions]
            indic_translations = self.batch_translate(trans_sents, src_lang, tgt_lang)

            for i in tqdm(range(len(transcriptions))):
                entry = transcriptions[i]
                entry['text'] = indic_translations[i]
                output_translations.append(entry)

            return output_translations
        else:
            return []

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
       data args:
            inputs (:obj: (transcript_path : 'str', src_lang : 'str', tgt_lang : 'str')
            kwargs
      Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """

        inputs = data.pop("inputs",data)

        src_lang = inputs["src_lang"]
        tgt_lang = inputs["tgt_lang"]
        transcript_path = inputs["transcript_path"]

        output_translations = []
        if self.model is not None:
            transcriptions = self.read_srt(transcript_path)
            trans_sents = [entry['text'] for entry in transcriptions]
            indic_translations = self.batch_translate(trans_sents, src_lang, tgt_lang)

            for i in tqdm(range(len(transcriptions))):
                entry = transcriptions[i]
                entry['text'] = indic_translations[i]
                output_translations.append(entry)

            return output_translations
        else:
            return []