faridulreza commited on
Commit
ecb30cf
1 Parent(s): 9f6ad07

Upload 6 files

Browse files
Files changed (6) hide show
  1. config.json +38 -0
  2. generation_config.json +6 -0
  3. model.py +74 -0
  4. pytorch_model.bin +3 -0
  5. server.py +36 -0
  6. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./save_model_7",
3
+ "activation_function": "gelu_new",
4
+ "architectures": ["GPT2LMHeadModel"],
5
+ "attn_pdrop": 0.0,
6
+ "bos_token_id": 50256,
7
+ "embd_pdrop": 0.0,
8
+ "eos_token_id": 50256,
9
+ "gradient_checkpointing": false,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-5,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 1024,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": null,
17
+ "n_layer": 12,
18
+ "n_positions": 1024,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.0,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "task_specific_params": {
29
+ "text-generation": {
30
+ "do_sample": true,
31
+ "max_length": 50
32
+ }
33
+ },
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.26.1",
36
+ "use_cache": true,
37
+ "vocab_size": 50257
38
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.26.1"
6
+ }
model.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Load the summarization model/tokenizer and define the delimiter tokens
# used to mark summaries in the fine-tuning data.
from transformers import GPT2LMHeadModel, AutoTokenizer
import re

# Tokenizer comes from the public Bengali GPT-2; weights from a local
# fine-tuned checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-bengali")
model = GPT2LMHeadModel.from_pretrained("./350")

# NOTE(review): hard requirement on a CUDA GPU — this fails on CPU-only hosts.
model.to("cuda")

# Debug aid: prints the full module tree at import time.
print(model)

# Delimiter strings (Bengali danda '।' inside the brackets) that bracket the
# source text and its summary in the model's training format.
BEGIN_TOKEN = "<।summary_begin।>"
END_TOKEN = "<।summary_end।>"
SUMMARY_TOKEN = "<।summary।>"
15
+
16
# Pre-compiled patterns, hoisted so repeated calls don't pay re-parsing cost.
# (The original also removed the '’' character twice — the duplicate is gone.)
_SENTENCE_END_RE = re.compile(r"[।!?;]")   # all normalized to danda + space
_QUOTE_RE = re.compile(r"[\"'’‘]")          # straight and curly quotes dropped
_WS_RE = re.compile(r"\s+")


def processTxt(txt):
    """Normalize Bengali text before feeding it to the model.

    - Sentence-ending punctuation (।, !, ?, ;) becomes "। " (danda + space).
    - A comma becomes ", " (comma + space).
    - Double quotes and straight/curly single quotes are removed.
    - Runs of whitespace collapse to a single space.

    The replaced character classes are disjoint, so one pass per class is
    equivalent to the original chain of per-character substitutions.
    """
    txt = _SENTENCE_END_RE.sub("। ", txt)
    txt = txt.replace(",", ", ")
    txt = _QUOTE_RE.sub("", txt)
    txt = _WS_RE.sub(" ", txt)
    return txt
31
+
32
+
33
def index_of(val, in_text, after=0):
    """Return the index of *val* in *in_text* at or after *after*, or -1 if absent.

    str.find already implements exactly the try/str.index/except-ValueError
    pattern of the original, returning -1 on a miss.
    """
    return in_text.find(val, after)
38
+
39
+
40
def summarize(txt):
    """Generate a summary for Bengali text *txt*.

    The input is normalized, wrapped in the model's delimiter tokens, and run
    through the fine-tuned GPT-2. The text between the summary marker and the
    next delimiter in the generated continuation is returned; "No Summary!"
    is returned when the model never emitted the summary marker.
    """
    txt = processTxt(txt.strip())
    # Bug fix: the prompt previously used literal "<|SUMMARY_BEGIN|>" and
    # "<|SUMMARY|>", which do not match the BEGIN_TOKEN/SUMMARY_TOKEN strings
    # this function searches for below. Use the shared constants so the prompt
    # and the parser agree on the delimiter format.
    txt = BEGIN_TOKEN + txt + SUMMARY_TOKEN

    inputs = tokenizer(txt, max_length=800, truncation=True, return_tensors="pt")
    inputs.to("cuda")
    # NOTE(review): max_length is measured in tokens but len(txt) counts
    # characters — this over-budgets for Bengali text; confirm intent.
    output = model.generate(inputs["input_ids"], max_length=len(txt) + 120)

    txt = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    # index_of returns -1 when the marker is absent, so after adding
    # len(SUMMARY_TOKEN) a missing marker yields len(SUMMARY_TOKEN) - 1.
    start = index_of(SUMMARY_TOKEN, txt) + len(SUMMARY_TOKEN)

    print(txt)  # debug: full decoded generation
    if start == len(SUMMARY_TOKEN) - 1:
        return "No Summary!"

    # The summary ends at the first delimiter after `start`: prefer the
    # explicit end token, then a repeated summary marker, then a new begin.
    end = index_of(END_TOKEN, txt, start)

    if end == -1:
        end = index_of(SUMMARY_TOKEN, txt, start)

    if end == -1:
        end = index_of(BEGIN_TOKEN, txt, start)

    if end == -1:
        return txt[start:].strip()

    txt = txt[start:end].strip()

    # Guard against a stray summary marker inside the extracted span.
    end = index_of(SUMMARY_TOKEN, txt)

    if end == -1:
        return txt
    return txt[:end].strip()
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdc50ee66a8dba2a224c595a373a0b94f4ce9cfdc050a4847bf961594ee29cef
3
+ size 510398013
server.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Run this file on server to return a Concentration Index (CI).
2
+ # Analysis is in 'Util' folder.
3
+
4
+ import base64
5
+ import io
6
+ import sys
7
+
8
+ from flask import (Flask, Response, jsonify, json, make_response, render_template,
9
+ request, send_file, send_from_directory)
10
+ from flask_cors import CORS, cross_origin
11
+
12
+ from model import summarize
13
+
14
+ app = Flask(__name__)
15
+ cors = CORS(app, resources={r'/*': {"origins": '*'}})
16
+ app.config['CORS_HEADER'] = 'Content-Type'
17
+
18
+
19
@app.route('/')
def index():
    """Health-check endpoint: confirms the server is running."""
    greeting = "Hello World!"
    return greeting
22
+
23
+
24
@app.route('/summarize/', methods=['POST'])
@cross_origin(origin='*', headers=['Content-Type'])
def getSummary():
    """Summarize the text in a JSON POST body.

    Expects ``{"text": ...}`` and responds with ``{"summary": ...}`` as
    application/json.
    """
    payload = request.get_json(force=True)

    result = summarize(payload['text'])
    resp = make_response({"summary": result})
    resp.headers.set('Content-Type', 'application/json')
    return resp
33
+
34
+
35
if __name__ == '__main__':
    # NOTE(review): debug=True combined with host 0.0.0.0 exposes the Werkzeug
    # interactive debugger (arbitrary code execution) to the whole network —
    # confirm this is disabled for any non-local deployment.
    app.run(host='0.0.0.0', debug=True, port=5000)
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d54fa4212c860b24a2f8df6215464f4a6dc98f5bfc673423e77c451d800edc0e
3
+ size 3451