Add custom pipeline

#1 by alinoc - opened
Files changed (3)
  1. pipeline.py +43 -0
  2. requirements.txt +6 -0
  3. translation.py +93 -0
pipeline.py ADDED
@@ -0,0 +1,43 @@
+from typing import Dict, Any
+from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
+from translation import fix_tokenizer, TextPreprocessor, sentenize_with_fillers
+from sentence_splitter import SentenceSplitter
+import torch
+
+class PreTrainedPipeline():
+    def __init__(self, path=""):
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
+        if torch.cuda.is_available():
+            self.model = self.model.cuda()
+        self.tokenizer = NllbTokenizer.from_pretrained(path)
+        fix_tokenizer(self.tokenizer)
+        self.splitter = SentenceSplitter(language='es')
+        self.preprocessor = TextPreprocessor()
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        inputs = data.get("text", "")
+        src_lang = data.get("src_lang", "spa_Latn")
+        tgt_lang = data.get("tgt_lang", "agr_Latn")
+        preprocess = data.get("preprocess", True)
+
+        sentences, fillers = sentenize_with_fillers(inputs, self.splitter)
+        if preprocess:
+            sentences = [self.preprocessor(sent) for sent in sentences]
+
+        translated_sentences = []
+        for sentence in sentences:
+            self.tokenizer.src_lang = src_lang
+            encoded = self.tokenizer(sentence, return_tensors="pt")
+            generated_tokens = self.model.generate(
+                **encoded.to(self.model.device),
+                forced_bos_token_id=self.tokenizer.lang_code_to_id[tgt_lang]
+            )
+            translated_sentences.append(
+                self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+            )
+
+        output = "".join(
+            filler + sentence for filler, sentence in zip(fillers, translated_sentences)
+        ) + fillers[-1]
+
+        return {"translation": output}
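For context, a minimal local smoke test of this handler could look like the sketch below; the checkpoint directory `"."` and the Spanish input are placeholders, and it assumes `pipeline.py` and `translation.py` sit next to the model weights after cloning the repository.

```python
# Hypothetical local check of the custom handler above (paths are placeholders).
from pipeline import PreTrainedPipeline

pipe = PreTrainedPipeline(path=".")  # directory holding weights, tokenizer, pipeline.py, translation.py
result = pipe({
    "text": "Hola. ¿Cómo estás?",  # placeholder Spanish input
    "src_lang": "spa_Latn",
    "tgt_lang": "agr_Latn",
    "preprocess": True,
})
print(result["translation"])
```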
requirements.txt ADDED
@@ -0,0 +1,6 @@
+requests==2.27.*
+sentencepiece==0.1.*
+torch==1.11.*
+transformers==4.33.*
+sentence-splitter==1.4
+sacremoses==0.0.45
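Not part of the PR, but a quick way to confirm these pins resolved in a given environment is to read the installed versions back, for example:

```python
# Hypothetical sanity check that the pinned distributions are installed.
from importlib.metadata import version

for dist in ("requests", "sentencepiece", "torch", "transformers", "sentence-splitter", "sacremoses"):
    print(dist, version(dist))
```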
translation.py ADDED
@@ -0,0 +1,93 @@
+import re
+import sys
+import typing as tp
+import unicodedata
+
+import torch
+from sacremoses import MosesPunctNormalizer
+from sentence_splitter import SentenceSplitter
+from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
+
+L1 = "spa_Latn"
+L2 = "agr_Latn"
+LANGUAGES = {
+    "Spanish | spa": L1,
+    "Awajun | agr": L2,
+}
+
+def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
+    non_printable_map = {
+        ord(c): replace_by
+        for c in (chr(i) for i in range(sys.maxunicode + 1))
+        # same as \p{C} in perl
+        # see https://www.unicode.org/reports/tr44/#General_Category_Values
+        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
+    }
+
+    def replace_non_printing_char(line) -> str:
+        return line.translate(non_printable_map)
+
+    return replace_non_printing_char
+
+class TextPreprocessor:
+    """
+    Mimic the text preprocessing made for the NLLB model.
+    This code is adapted from the Stopes repo of the NLLB team:
+    https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214
+    """
+
+    def __init__(self, lang="en"):
+        self.mpn = MosesPunctNormalizer(lang=lang)
+        self.mpn.substitutions = [
+            (re.compile(r), sub) for r, sub in self.mpn.substitutions
+        ]
+        self.replace_nonprint = get_non_printing_char_replacer(" ")
+
+    def __call__(self, text: str) -> str:
+        clean = self.mpn.normalize(text)
+        clean = self.replace_nonprint(clean)
+        # replace 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 by Francesca
+        clean = unicodedata.normalize("NFKC", clean)
+        return clean
+
+def fix_tokenizer(tokenizer, new_lang=L2):
+    """Add a new language token to the tokenizer vocabulary
+    (this should be done each time after its initialization)
+    """
+    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
+    tokenizer.lang_code_to_id[new_lang] = old_len - 1
+    tokenizer.id_to_lang_code[old_len - 1] = new_lang
+    # always move "mask" to the last position
+    tokenizer.fairseq_tokens_to_ids["<mask>"] = (
+        len(tokenizer.sp_model)
+        + len(tokenizer.lang_code_to_id)
+        + tokenizer.fairseq_offset
+    )
+
+    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
+    tokenizer.fairseq_ids_to_tokens = {
+        v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()
+    }
+    if new_lang not in tokenizer._additional_special_tokens:
+        tokenizer._additional_special_tokens.append(new_lang)
+    # clear the added token encoder; otherwise a new token may end up there by mistake
+    tokenizer.added_tokens_encoder = {}
+    tokenizer.added_tokens_decoder = {}
+
+def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
+    """Apply a sentence splitter and return the sentences and all separators before and after them"""
+    if fix_double_space:
+        text = re.sub(" +", " ", text)
+    sentences = splitter.split(text)
+    fillers = []
+    i = 0
+    for sentence in sentences:
+        start_idx = text.find(sentence, i)
+        if ignore_errors and start_idx == -1:
+            # print(f"sent not found after {i}: `{sentence}`")
+            start_idx = i + 1
+        assert start_idx != -1, f"sent not found after {i}: `{sentence}`"
+        fillers.append(text[i:start_idx])
+        i = start_idx + len(sentence)
+    fillers.append(text[i:])
+    return sentences, fillers
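For reviewers, below is a sketch of how these helpers could be exercised on their own, mirroring what `pipeline.py` does; `MODEL_PATH` and the example sentence are placeholders, not part of this PR.

```python
# Hypothetical standalone use of the helpers in translation.py.
from sentence_splitter import SentenceSplitter
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

from translation import TextPreprocessor, fix_tokenizer, sentenize_with_fillers

MODEL_PATH = "path/to/checkpoint"  # placeholder for the actual model directory or Hub id

tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
fix_tokenizer(tokenizer)  # register the agr_Latn language code added during fine-tuning
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

preprocessor = TextPreprocessor()
splitter = SentenceSplitter(language="es")

text = "Hola.  ¿Cómo estás?"  # placeholder input with a double space
sentences, fillers = sentenize_with_fillers(text, splitter)
sentences = [preprocessor(s) for s in sentences]

tokenizer.src_lang = "spa_Latn"
batch = tokenizer(sentences, return_tensors="pt", padding=True)
tokens = model.generate(
    **batch,
    forced_bos_token_id=tokenizer.lang_code_to_id["agr_Latn"],
)
translations = tokenizer.batch_decode(tokens, skip_special_tokens=True)

# Reattach the separators captured by sentenize_with_fillers around each sentence.
print("".join(f + t for f, t in zip(fillers, translations)) + fillers[-1])
```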