faridulreza
/

gpt2-bangla-summurizer

text-generation

Inference Endpoints

text-generation-inference

Model card | Files and versions | Community

faridulreza committed on Aug 28, 2023

Commit

65d76f3

•

1 Parent(s): 3029111

Update README.md

Files changed (1) hide show

README.md +9 -8

README.md CHANGED Viewed

@@ -32,10 +32,11 @@ model = GPT2LMHeadModel.from_pretrained("faridulreza/gpt2-bangla-summurizer")
 model.to("cuda")
 BEGIN_TOKEN = "<।summary_begin।>"
-END_TOKEN = "<।summary_end।>"
 SUMMARY_TOKEN = "<।summary।>"
 def processTxt(txt):
     txt = re.sub(r"।", "। ", txt)
     txt = re.sub(r",", ", ", txt)
@@ -59,26 +60,26 @@ def index_of(val, in_text, after=0):
     except ValueError:
         return -1
 def summarize(txt):
     txt = processTxt(txt.strip())
-    txt = "<|SUMMARY_BEGIN|>" + txt + "<|SUMMARY|>"
     inputs = tokenizer(txt, max_length=800, truncation=True, return_tensors="pt")
     inputs.to("cuda")
-    output = model.generate(inputs["input_ids"], max_length=len(txt) + 120)
     txt = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
     start = index_of(SUMMARY_TOKEN, txt) + len(SUMMARY_TOKEN)
     if start == len(SUMMARY_TOKEN) - 1:
         return "No Summary!"
     end = index_of(END_TOKEN, txt, start)
     if end == -1:
-        end = index_of(SUMMARY_TOKEN, txt, start)
     if end == -1:
         end = index_of(BEGIN_TOKEN, txt, start)
@@ -88,7 +89,7 @@ def summarize(txt):
     txt = txt[start:end].strip()
-    end = index_of(SUMMARY_TOKEN, txt)
     if end == -1:
         return txt

 model.to("cuda")
 BEGIN_TOKEN = "<।summary_begin।>"
+END_TOKEN = " <।summary_end।>"
+BEGIN_TOKEN_ALT = "<।sum_begin।>"
+END_TOKEN_ALT = " <।sum_end।>"
 SUMMARY_TOKEN = "<।summary।>"
 def processTxt(txt):
     txt = re.sub(r"।", "। ", txt)
     txt = re.sub(r",", ", ", txt)
     except ValueError:
         return -1
 def summarize(txt):
     txt = processTxt(txt.strip())
+    txt = BEGIN_TOKEN + txt + SUMMARY_TOKEN
     inputs = tokenizer(txt, max_length=800, truncation=True, return_tensors="pt")
     inputs.to("cuda")
+    output = model.generate(inputs["input_ids"], max_length=len(txt) + 220, pad_token_id=tokenizer.eos_token_id)
     txt = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
     start = index_of(SUMMARY_TOKEN, txt) + len(SUMMARY_TOKEN)
+    print("Whole text completion: \n",txt)
     if start == len(SUMMARY_TOKEN) - 1:
         return "No Summary!"
     end = index_of(END_TOKEN, txt, start)
     if end == -1:
+        end = index_of(END_TOKEN_ALT, txt, start)
     if end == -1:
         end = index_of(BEGIN_TOKEN, txt, start)
     txt = txt[start:end].strip()
+    end = index_of(SUMMARY_TOKEN,txt)
     if end == -1:
         return txt