import re

from transformers import GPT2LMHeadModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-bengali")
model = GPT2LMHeadModel.from_pretrained("./350")
model.to("cuda")
print(model)

BEGIN_TOKEN = "<।summary_begin।>"
END_TOKEN = "<।summary_end।>"
SUMMARY_TOKEN = "<।summary।>"


def processTxt(txt):
    # Normalize punctuation: ensure a space after the danda and commas,
    # map other sentence terminators to danda, and strip quote characters.
    txt = re.sub(r"।", "। ", txt)
    txt = re.sub(r",", ", ", txt)
    txt = re.sub(r"!", "। ", txt)
    txt = re.sub(r"\?", "। ", txt)
    txt = re.sub(r"[\"'’‘]", "", txt)
    txt = re.sub(r";", "। ", txt)
    txt = re.sub(r"\s+", " ", txt)  # collapse runs of whitespace
    return txt


def index_of(val, in_text, after=0):
    # str.index raises ValueError when the substring is absent; return -1 instead.
    try:
        return in_text.index(val, after)
    except ValueError:
        return -1


def summarize(txt):
    txt = processTxt(txt.strip())
    # Use the defined marker tokens so the index_of lookups below can find
    # them again in the decoded output. (The original concatenated the
    # mismatched literals "<|SUMMARY_BEGIN|>" and "<|SUMMARY|>" here.)
    txt = BEGIN_TOKEN + txt + SUMMARY_TOKEN
    inputs = tokenizer(txt, max_length=800, truncation=True, return_tensors="pt")
    inputs = inputs.to("cuda")
    # Generate up to 120 new tokens after the prompt. (The original passed
    # max_length=len(txt) + 120, which mixed character and token counts.)
    output = model.generate(inputs["input_ids"], max_new_tokens=120)
    txt = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    start = index_of(SUMMARY_TOKEN, txt) + len(SUMMARY_TOKEN)
    print(txt)
    if start == len(SUMMARY_TOKEN) - 1:  # index_of returned -1: no summary marker
        return "No Summary!"
    # The summary ends at the next marker the model emits, whichever comes first.
    end = index_of(END_TOKEN, txt, start)
    if end == -1:
        end = index_of(SUMMARY_TOKEN, txt, start)
    if end == -1:
        end = index_of(BEGIN_TOKEN, txt, start)
    if end == -1:
        return txt[start:].strip()
    txt = txt[start:end].strip()
    # Guard against a second summary marker inside the extracted span.
    end = index_of(SUMMARY_TOKEN, txt)
    if end == -1:
        return txt
    return txt[:end].strip()
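
# Minimal usage sketch, not part of the original script: the Bengali sentence
# below is a hypothetical placeholder; any Bengali prose under the 800-token
# input cap should behave the same way.
if __name__ == "__main__":
    article = "বাংলাদেশের রাজধানী ঢাকা। এটি দেশের বৃহত্তম শহর।"
    print(summarize(article))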