# Spaces: Sleeping  (page-status residue from the web capture; not part of the program)
import re

import torch
class CharMapper:
    """Encode alphanumeric (Turkish-aware) text as fixed-length label tensors.

    Every character of ``letters`` gets a positive integer id in order of
    appearance (starting at 1).  Two special tokens are added: ``"[END]"``
    with id 0 and ``"[PAD]"`` with id ``len(letters) + 1``.  ``map`` goes
    from token to id, ``reverseMap`` from id to token.
    """

    # Translation tables (keyed by code point, as str.translate() expects)
    # for the Turkish dotted/dotless "i" pairs.
    lower2upper = {
        ord("i"): "İ",
        ord("ı"): "I",
    }
    upper2lower = {
        ord("İ"): "i",
        ord("I"): "ı",
    }

    def __init__(self, letters: str = "0123456789abcçdefgğhıijklmnoöpqrsştuüvwxyz", maxLength: int = 25):
        """Build the token <-> id lookup tables.

        Args:
            letters: Characters that can be encoded; ids follow their order.
            maxLength: Maximum number of characters per text.  Encoded
                tensors have ``maxLength + 1`` entries (one extra slot for
                the ``"[END]"`` token).
        """
        self.letters = letters
        self.maxLength = maxLength
        self.map = {"[END]": 0}
        self.reverseMap = {0: "[END]"}
        for index, letter in enumerate(self.letters, start=1):
            self.map[letter] = index
            self.reverseMap[index] = letter
        # The padding id is the first id after the last letter.
        pad_index = len(self.letters) + 1
        self.map["[PAD]"] = pad_index
        self.reverseMap[pad_index] = "[PAD]"

    def __call__(self, text: str, return_length=False):
        """Encode ``text`` into a padded 1-D tensor of token ids.

        The text is first normalised with ``text2label``; the ids of its
        characters are followed by the ``"[END]"`` id and the remainder of
        the tensor is filled with the ``"[PAD]"`` id.

        Args:
            text: Raw input text.
            return_length: When true, also return the number of meaningful
                entries (normalised characters plus the ``"[END]"`` token).

        Returns:
            The tensor of ``maxLength + 1`` ids, or ``(tensor, length)``
            when ``return_length`` is true.  The tensor is built from
            ``torch.ones`` and therefore has torch's default floating dtype.

        Raises:
            KeyError: If a normalised character is not part of ``letters``.

        Note:
            Text longer than ``maxLength`` characters is not truncated; the
            slice assignment below then fails with a size mismatch.
        """
        normalized = self.text2label(text)
        length = len(normalized) + 1
        encoded = torch.tensor([self.map[char] for char in normalized] + [self.map["[END]"]])
        padded = torch.ones((self.maxLength + 1,)) * self.map["[PAD]"]
        padded[:len(encoded)] = encoded
        if return_length:
            return padded, length
        return padded

    def reverseMapper(self, label: torch.Tensor):
        """Decode a 1-D tensor of token ids back into text.

        Args:
            label: Tensor of ids as produced by ``__call__``.

        Returns:
            The decoded characters that precede the first ``"[END]"`` token.

        Raises:
            KeyError: If an id is not present in ``reverseMap``.
        """
        # tolist() yields plain Python numbers; a float such as 26.0 hashes
        # and compares equal to the int key 26, so float labels decode too.
        indices = label.detach().cpu().tolist()
        decoded = "".join(self.reverseMap[index] for index in indices)
        return decoded.split("[END]")[0]

    def text2label(self, text):
        """Normalise ``text``: drop unsupported characters and lowercase it.

        Everything except ASCII digits/letters and the Turkish letters
        listed in the pattern is removed.  The ``upper2lower`` table is
        applied before ``str.lower()`` so that "İ" becomes "i" and "I"
        becomes "ı".
        """
        kept = re.sub(r"[^0-9a-zA-ZğüşöçıİĞÜŞÖÇ]+", "", text)
        return kept.translate(self.upper2lower).lower()
if __name__ == '__main__':
    # Demo: encode a sample string containing punctuation and Turkish
    # capitals with the default mapper, then print the resulting tensor.
    mapper = CharMapper()
    mapped = mapper("!MA-PİŞ$Z")
    print(mapped)