faridulreza committed on
Commit
65d76f3
1 Parent(s): 3029111

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +9 -8
README.md CHANGED
@@ -32,10 +32,11 @@ model = GPT2LMHeadModel.from_pretrained("faridulreza/gpt2-bangla-summurizer")
32
  model.to("cuda")
33
 
34
  BEGIN_TOKEN = "<।summary_begin।>"
35
- END_TOKEN = "<।summary_end।>"
 
 
36
  SUMMARY_TOKEN = "<।summary।>"
37
 
38
-
39
  def processTxt(txt):
40
  txt = re.sub(r"।", "। ", txt)
41
  txt = re.sub(r",", ", ", txt)
@@ -59,26 +60,26 @@ def index_of(val, in_text, after=0):
59
  except ValueError:
60
  return -1
61
 
62
-
63
  def summarize(txt):
64
  txt = processTxt(txt.strip())
65
- txt = "<|SUMMARY_BEGIN|>" + txt + "<|SUMMARY|>"
66
 
67
  inputs = tokenizer(txt, max_length=800, truncation=True, return_tensors="pt")
68
  inputs.to("cuda")
69
- output = model.generate(inputs["input_ids"], max_length=len(txt) + 120)
 
70
  txt = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
71
 
72
  start = index_of(SUMMARY_TOKEN, txt) + len(SUMMARY_TOKEN)
73
 
74
-
75
  if start == len(SUMMARY_TOKEN) - 1:
76
  return "No Summary!"
77
 
78
  end = index_of(END_TOKEN, txt, start)
79
 
80
  if end == -1:
81
- end = index_of(SUMMARY_TOKEN, txt, start)
82
 
83
  if end == -1:
84
  end = index_of(BEGIN_TOKEN, txt, start)
@@ -88,7 +89,7 @@ def summarize(txt):
88
 
89
  txt = txt[start:end].strip()
90
 
91
- end = index_of(SUMMARY_TOKEN, txt)
92
 
93
  if end == -1:
94
  return txt
 
32
  model.to("cuda")
33
 
34
  BEGIN_TOKEN = "<।summary_begin।>"
35
+ END_TOKEN = " <।summary_end।>"
36
+ BEGIN_TOKEN_ALT = "<।sum_begin।>"
37
+ END_TOKEN_ALT = " <।sum_end।>"
38
  SUMMARY_TOKEN = "<।summary।>"
39
 
 
40
  def processTxt(txt):
41
  txt = re.sub(r"।", "। ", txt)
42
  txt = re.sub(r",", ", ", txt)
 
60
  except ValueError:
61
  return -1
62
 
 
63
  def summarize(txt):
64
  txt = processTxt(txt.strip())
65
+ txt = BEGIN_TOKEN + txt + SUMMARY_TOKEN
66
 
67
  inputs = tokenizer(txt, max_length=800, truncation=True, return_tensors="pt")
68
  inputs.to("cuda")
69
+ output = model.generate(inputs["input_ids"], max_length=len(txt) + 220, pad_token_id=tokenizer.eos_token_id)
70
+
71
  txt = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
72
 
73
  start = index_of(SUMMARY_TOKEN, txt) + len(SUMMARY_TOKEN)
74
 
75
+ print("Whole text completion: \n",txt)
76
  if start == len(SUMMARY_TOKEN) - 1:
77
  return "No Summary!"
78
 
79
  end = index_of(END_TOKEN, txt, start)
80
 
81
  if end == -1:
82
+ end = index_of(END_TOKEN_ALT, txt, start)
83
 
84
  if end == -1:
85
  end = index_of(BEGIN_TOKEN, txt, start)
 
89
 
90
  txt = txt[start:end].strip()
91
 
92
+ end = index_of(SUMMARY_TOKEN,txt)
93
 
94
  if end == -1:
95
  return txt