megalaa committed on
Commit d33d554
1 Parent(s): 6d017d4

Upload 11 files

args.json ADDED
@@ -0,0 +1 @@
+ {"model_name": "hf/fifth_attempt-norm_group_greekified-finetuned", "src_language": "cop", "tgt_language": "eng", "max_input_length": 128, "max_target_length": 128}
config.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "_name_or_path": "Helsinki-NLP/opus-mt-mul-en",
+   "activation_dropout": 0.0,
+   "activation_function": "swish",
+   "add_bias_logits": false,
+   "add_final_layer_norm": false,
+   "architectures": [
+     "MarianMTModel"
+   ],
+   "attention_dropout": 0.0,
+   "bad_words_ids": [
+     [
+       64171
+     ]
+   ],
+   "bos_token_id": 0,
+   "classif_dropout": 0.0,
+   "classifier_dropout": 0.0,
+   "d_model": 512,
+   "decoder_attention_heads": 8,
+   "decoder_ffn_dim": 2048,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 64171,
+   "decoder_vocab_size": 64172,
+   "dropout": 0.1,
+   "encoder_attention_heads": 8,
+   "encoder_ffn_dim": 2048,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 0,
+   "extra_pos_embeddings": 64172,
+   "forced_eos_token_id": 0,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "max_length": 512,
+   "max_position_embeddings": 512,
+   "model_type": "marian",
+   "normalize_before": false,
+   "normalize_embedding": false,
+   "num_beams": 6,
+   "num_hidden_layers": 6,
+   "pad_token_id": 64171,
+   "scale_embedding": true,
+   "share_encoder_decoder_embeddings": true,
+   "static_position_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.1",
+   "use_cache": true,
+   "vocab_size": 64172
+ }
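As a sanity check, the config above can be loaded on its own before serving. A minimal sketch, assuming the files from this commit sit in a hypothetical local directory ./model:

from transformers import MarianConfig, MarianMTModel

# "./model" is a placeholder for a local checkout of this commit's files.
config = MarianConfig.from_pretrained("./model")
assert config.model_type == "marian" and config.d_model == 512

# Building from the config alone gives randomly initialized weights;
# from_pretrained("./model") would also load pytorch_model.bin.
model = MarianMTModel(config)
print(sum(p.numel() for p in model.parameters()))  # ~77M params, consistent with the ~308 MB float32 checkpoint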
config.properties ADDED
@@ -0,0 +1,13 @@
+ models={\
+     "cop-en-norm-group-greekified": {\
+         "1.0": {\
+             "defaultVersion": true,\
+             "marName": "cop-en-norm-group-greekified.mar",\
+             "minWorkers": 1,\
+             "maxWorkers": 4,\
+             "batchSize": 1,\
+             "maxBatchDelay": 100,\
+             "responseTimeout": 120\
+         }\
+     }\
+ }
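This config.properties registers the archive cop-en-norm-group-greekified.mar with TorchServe (1-4 workers, batch size 1, 100 ms max batch delay, 120 s response timeout). A minimal client sketch, assuming TorchServe is running locally with this config on its default inference port 8080; the Coptic sample text is illustrative:

import requests

# The path component matches the model name registered in config.properties.
url = "http://localhost:8080/predictions/cop-en-norm-group-greekified"
resp = requests.post(url, data="ⲁⲩⲱ ⲡⲉϫⲁϥ ⲛⲁⲩ".encode("utf-8"))
print(resp.json())  # handler.py returns {"code": 200, "translation": "..."} on success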
handler.py ADDED
@@ -0,0 +1,199 @@
+ from dataclasses import dataclass
+ import json
+ import logging
+ import os
+ from abc import ABC
+ from typing import Optional
+
+ import torch
+
+ from transformers import (
+     AutoModelForSeq2SeqLM,
+     AutoTokenizer,
+ )
+ from ts.torch_handler.base_handler import BaseHandler
+
+ logger = logging.getLogger(__name__)
+
+ MAX_TOKEN_LENGTH_ERR = {
+     "code": 422,
+     "type": "MaxTokenLengthError",
+     "message": "Max token length exceeded",
+ }
+
+
+ class CopEngHandler(BaseHandler, ABC):
+
+     @dataclass
+     class GenerationConfig:
+         max_length: int = 20
+         max_new_tokens: Optional[int] = None
+         min_length: int = 0
+         min_new_tokens: Optional[int] = None
+         early_stopping: bool = True
+         do_sample: bool = False
+         num_beams: int = 1
+         num_beam_groups: int = 1
+         top_k: int = 50
+         top_p: float = 0.95
+         temperature: float = 1.0
+         diversity_penalty: float = 0.0
+
+     def __init__(self):
+         super(CopEngHandler, self).__init__()
+         self.initialized = False
+
+     def initialize(self, ctx):
+         """Load the fine-tuned MarianMT model and its tokenizer from the
+         model directory and configure generation.
+
+         Args:
+             ctx (context): A JSON object containing information
+             pertaining to the model artifact parameters.
+         """
+         logger.info("Start initialize")
+         self.manifest = ctx.manifest
+         properties = ctx.system_properties
+         model_dir = properties.get("model_dir")
+         serialized_file = self.manifest["model"]["serializedFile"]
+         model_pt_path = os.path.join(model_dir, serialized_file)
+
+         setup_config_path = os.path.join(model_dir, "setup_config.json")
+         if os.path.isfile(setup_config_path):
+             with open(setup_config_path) as setup_config_file:
+                 self.setup_config = json.load(setup_config_file)
+
+         seed = 42
+         torch.manual_seed(seed)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         logger.info("Device: %s", self.device)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
+         self.model.to(self.device)
+         self.model.eval()
+         self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+         self.config = CopEngHandler.GenerationConfig(
+             max_new_tokens=128,
+             min_new_tokens=1,
+             num_beams=5,
+         )
+         self.initialized = True
+         logger.info("Init done")
+
+     def preprocess(self, requests):
+         preprocessed_data = []
+         for data in requests:
+             data_item = data.get("data")
+             if data_item is None:
+                 data_item = data.get("body")
+             if isinstance(data_item, (bytes, bytearray)):
+                 data_item = data_item.decode("utf-8")
+             preprocessed_data.append(greekify(data_item))
+         logger.info("preprocessed_data: %s", preprocessed_data)
+         return preprocessed_data
+
+     def inference(self, data):
+         indices = {}
+         batch = []
+         for i, item in enumerate(data):
+             tokens = self.tokenizer(item, return_tensors="pt", padding=True)
+             # input_ids has shape (1, seq_len); check the sequence length
+             # directly rather than squeezing, which fails on 1-token inputs.
+             if tokens.input_ids.shape[1] > self.tokenizer.model_max_length:
+                 logger.info("Skipping overlong input at index %s", i)
+                 continue
+             indices[i] = len(batch)
+             batch.append(data[i])
+         logger.info("inference batch: %s", batch)
+         result = self.batch_translate(batch)
+         return [result[indices[i]] if i in indices else None for i in range(len(data))]
+
+     def postprocess(self, output):
+         return output
+
+     def handle(self, requests, context):
+         preprocessed = self.preprocess(requests)
+         inference_data = self.inference(preprocessed)
+         postprocessed = self.postprocess(inference_data)
+         logger.info("inference result: %s", postprocessed)
+
+         responses = [
+             {"code": 200, "translation": translation}
+             if translation
+             else MAX_TOKEN_LENGTH_ERR
+             for translation in postprocessed
+         ]
+
+         return responses
+
+     def batch_translate(self, input_sentences, output_confidence=False):
+         if len(input_sentences) == 0:
+             return []
+         inputs = self.tokenizer(input_sentences, return_tensors="pt", padding=True).to(
+             self.device
+         )
+         # max_new_tokens takes precedence over max_length when both are set.
+         outputs = self.model.generate(
+             **inputs,
+             max_length=self.config.max_length,
+             max_new_tokens=self.config.max_new_tokens,
+             min_length=self.config.min_length,
+             min_new_tokens=self.config.min_new_tokens,
+             early_stopping=self.config.early_stopping,
+             do_sample=self.config.do_sample,
+             num_beams=self.config.num_beams,
+             num_beam_groups=self.config.num_beam_groups,
+             top_k=self.config.top_k,
+             top_p=self.config.top_p,
+             temperature=self.config.temperature,
+             diversity_penalty=self.config.diversity_penalty,
+             output_scores=output_confidence,
+             return_dict_in_generate=True,
+         )
+         translated_text = self.tokenizer.batch_decode(
+             outputs.sequences, skip_special_tokens=True
+         )
+         return translated_text
+
+
+ COPTIC_TO_GREEK = {
+     "ⲁ": "α",
+     "ⲃ": "β",
+     "ⲅ": "γ",
+     "ⲇ": "δ",
+     "ⲉ": "ε",
+     "ⲋ": "ϛ",
+     "ⲍ": "ζ",
+     "ⲏ": "η",
+     "ⲑ": "θ",
+     "ⲓ": "ι",
+     "ⲕ": "κ",
+     "ⲗ": "λ",
+     "ⲙ": "μ",
+     "ⲛ": "ν",
+     "ⲝ": "ξ",
+     "ⲟ": "ο",
+     "ⲡ": "π",
+     "ⲣ": "ρ",
+     "ⲥ": "σ",
+     "ⲧ": "τ",
+     "ⲩ": "υ",
+     "ⲫ": "φ",
+     "ⲭ": "χ",
+     "ⲯ": "ψ",
+     "ⲱ": "ω",
+     "ϣ": "s",
+     "ϥ": "f",
+     "ϧ": "k",
+     "ϩ": "h",
+     "ϫ": "j",
+     "ϭ": "c",
+     "ϯ": "t",
+ }
+
+
+ def greekify(coptic_text):
+     chars = []
+     for c in coptic_text:
+         l_c = c.lower()
+         chars.append(COPTIC_TO_GREEK.get(l_c, l_c))
+     return "".join(chars)
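The handler can also be exercised outside TorchServe for local testing. A minimal sketch of the same pipeline (greekify, tokenize, beam-search generate, decode), assuming this commit's files sit in a hypothetical local directory ./model and greekify is the function defined above:

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")
model = AutoModelForSeq2SeqLM.from_pretrained("./model").eval()

# Same generation settings the handler sets in initialize().
text = greekify("ⲁⲩⲱ ⲡⲉϫⲁϥ ⲛⲁⲩ")  # illustrative Coptic input
inputs = tokenizer(text, return_tensors="pt", padding=True)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=128, min_new_tokens=1, num_beams=5)
print(tokenizer.batch_decode(out, skip_special_tokens=True))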
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31bc6324553201d1662acb68318dc113126305371445ea9c10f958d72ff56192
+ size 308321413
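The file above is a Git LFS pointer, not the weights themselves: a clone without git-lfs yields only these three lines. One way to fetch the actual ~308 MB checkpoint is huggingface_hub; the repo id below is a guess from the committer name and should be replaced with the real one:

from huggingface_hub import hf_hub_download

# repo_id is hypothetical; substitute the actual Hub repository for this commit.
path = hf_hub_download(
    repo_id="megalaa/cop-en-norm-group-greekified",
    filename="pytorch_model.bin",
)
print(path)  # local cache path of the downloaded weights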
setup_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "model_name": "cop-en-norm-group-greekified",
+   "save_mode": "pretrained",
+   "max_length": "512"
+ }
source.spm ADDED
Binary file (707 kB)
 
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
target.spm ADDED
Binary file (791 kB)
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "separate_vocabs": false,
+   "source_lang": "mul",
+   "sp_model_kwargs": {},
+   "target_lang": "eng",
+   "tokenizer_class": "MarianTokenizer",
+   "unk_token": "<unk>"
+ }
vocab.json ADDED
The diff for this file is too large to render.