"""
Copyright @ nguyenvanhieu.vn
This Python implementation does not preserve lower/upper case.
It will be updated when time permits.
"""
import re
import unicodedata

# Accented Vietnamese letters, lower case first and then upper case.  The six
# trailing characters repeat the toneless upper-case letters Â Ă Đ Ô Ơ Ư.
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"

# Unaccented counterpart of uniChars, position for position.  The run lengths
# are spelled out so each group provably lines up with its group in uniChars
# (17 a-forms, 11 e-forms, đ, 5 i-forms, 17 o-forms, 11 u-forms, 5 y-forms).
unsignChars = (
    "a" * 17 + "e" * 11 + "d" + "i" * 5 + "o" * 17 + "u" * 11 + "y" * 5
    + "A" * 17 + "E" * 11 + "D" + "I" * 5 + "O" * 17 + "U" * 11 + "Y" * 5
    + "AADOOU"
)

# Combining tone marks: grave, acute, hook above, tilde, dot below.
_TONE_MARKS = "\u0300\u0301\u0309\u0303\u0323"


def loaddicchar():
    """Build the decomposed -> precomposed lookup table.

    For every toned letter in ``uniChars`` the key is the letter written as
    "base letter + one combining tone mark", where the base letter keeps its
    circumflex, breve or horn in precomposed form (for example ``â`` followed
    by U+0300 for ``ầ``).  The value is the single precomposed character.
    Letters without a tone mark (â, ă, ê, ô, ơ, ư, đ) get no entry.

    The keys are derived with ``unicodedata`` instead of being typed as
    literals: the two spellings look identical on screen, so a literal table
    is easily damaged by tools that normalise the source file.

    :return: dict mapping each two-code-point sequence to its precomposed
        character.
    """
    dic = {}
    for composed in uniChars:
        decomposed = unicodedata.normalize("NFD", composed)
        tone = next((mark for mark in decomposed if mark in _TONE_MARKS), None)
        if tone is None:
            # No tone mark: the letter has no "base + tone" spelling.
            continue
        # Re-compose whatever is left (e.g. a + circumflex -> â) so that the
        # key is exactly "base letter + tone mark".
        base = unicodedata.normalize("NFC", decomposed.replace(tone, ""))
        dic[base + tone] = composed
    return dic


dicchar = loaddicchar()

# Compiled once at import time; the keys contain no regex metacharacters.
_DECOMPOSED_RE = re.compile("|".join(dicchar))


def convertwindown1525toutf8(txt):
    """Replace "base letter + combining tone mark" pairs by one character.

    Only the sequences listed in ``dicchar`` are rewritten; all other text is
    returned untouched.

    :param txt: text that may contain decomposed Vietnamese letters.
    :return: ``txt`` with every ``dicchar`` key replaced by its value.
    """
    return _DECOMPOSED_RE.sub(lambda match: dicchar[match.group()], txt)


# Section: convert a sentence to the keystrokes typed in Telex mode when
# Unikey is not enabled.
# Example: thủy = thuyr, tượng = tuwowngj

# One row per vowel.  Columns 0-5 hold the vowel with no tone, grave, acute,
# hook above, tilde and dot below; column 6 is the Telex spelling of the
# toneless vowel.
bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
# Telex tone keys, indexed by the tone column of bang_nguyen_am
# (column 0 = no tone, hence the empty string).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Maps every vowel form to its (row, tone column) position in bang_nguyen_am.
# The last column of each row is a Telex spelling, not a vowel, so it is
# left out of the lookup.
nguyen_am_to_ids = {
    vowel: (row, tone)
    for row, forms in enumerate(bang_nguyen_am)
    for tone, vowel in enumerate(forms[:-1])
}


def vn_word_to_telex_type(word):
    """Spell a single word as Telex keystrokes.

    Each character found in ``nguyen_am_to_ids`` is replaced by the Telex
    spelling stored in the last column of its ``bang_nguyen_am`` row; every
    other character is copied as is.  The key for the last tone seen in the
    word is appended once at the end (nothing is appended when no vowel
    carries a tone).

    :param word: a single word.
    :return: the Telex form, e.g. ``thủy`` gives ``thuyr``.
    """
    tone = 0
    pieces = []
    for letter in word:
        row, column = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            pieces.append(letter)
        else:
            if column != 0:
                tone = column
            pieces.append(bang_nguyen_am[row][-1])
    pieces.append(bang_ky_tu_dau[tone])
    return ''.join(pieces)


def vn_sentence_to_telex_type(sentence):
    """Convert an accented Vietnamese sentence to Telex keystrokes.

    The sentence is split on whitespace, each word is converted with
    ``vn_word_to_telex_type`` and the results are joined by single spaces.

    :param sentence: the text to convert.
    :return: the converted text.
    """
    return ' '.join(vn_word_to_telex_type(word) for word in sentence.split())


# Section: normalise tone-mark placement to the old style, i.e. write
# òa, úy rather than oà, uý.
# See: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF


def chuan_hoa_dau_tu_tieng_viet(word):
    """Move the tone mark of one word to its old-style position.

    The word is returned unchanged when ``is_valid_vietnam_word`` rejects it,
    or when it has fewer than two counted vowels and no ``qu``/``gi`` pair.
    Otherwise all vowels are stripped of their tone and the last tone seen
    is put back on exactly one vowel:

    * ``qu``/``gi`` with fewer than two counted vowels: the character at
      position 2 when it is a vowel, else the ``u``/``i`` at position 1;
    * else the first ``ê`` or ``ơ`` among the counted vowels, if any;
    * else, for exactly two vowels that end the word, the first of them;
    * else the second counted vowel.

    :param word: a single word; only the vowel forms listed in
        ``nguyen_am_to_ids`` are recognised.
    :return: the word with its tone mark repositioned.
    """
    if not is_valid_vietnam_word(word):
        return word

    letters = list(word)
    tone = 0
    vowel_positions = []
    has_qu_or_gi = False
    for pos, letter in enumerate(letters):
        row, column = nguyen_am_to_ids.get(letter, (-1, -1))
        if row == -1:
            continue
        if pos != 0:
            before = letters[pos - 1]
            if row == 9 and before == 'q':  # u right after q
                letters[pos] = 'u'
                has_qu_or_gi = True
            elif row == 5 and before == 'g':  # i right after g
                letters[pos] = 'i'
                has_qu_or_gi = True
        if column != 0:
            # Remember the tone and leave the bare vowel in place for now.
            tone = column
            letters[pos] = bang_nguyen_am[row][0]
        # Once a qu/gi pair has been seen, position 1 is not counted as a
        # vowel of the word.
        if pos != 1 or not has_qu_or_gi:
            vowel_positions.append(pos)

    if len(vowel_positions) < 2:
        if not has_qu_or_gi:
            return word
        if len(letters) == 2:
            # letters[1] was reset to plain 'u' or 'i' in the loop above, so
            # it is always present in the lookup.
            row = nguyen_am_to_ids[letters[1]][0]
            letters[1] = bang_nguyen_am[row][tone]
        else:
            row = nguyen_am_to_ids.get(letters[2], (-1, -1))[0]
            if row != -1:
                letters[2] = bang_nguyen_am[row][tone]
            elif letters[1] == 'i':
                letters[1] = bang_nguyen_am[5][tone]
            else:
                letters[1] = bang_nguyen_am[9][tone]
        return ''.join(letters)

    # ê and ơ take the tone whenever one of them is present.
    for pos in vowel_positions:
        row = nguyen_am_to_ids[letters[pos]][0]
        if row in (4, 8):
            letters[pos] = bang_nguyen_am[row][tone]
            return ''.join(letters)

    ends_with_vowel_pair = (
        len(vowel_positions) == 2 and vowel_positions[-1] == len(letters) - 1
    )
    target = vowel_positions[0] if ends_with_vowel_pair else vowel_positions[1]
    row = nguyen_am_to_ids[letters[target]][0]
    letters[target] = bang_nguyen_am[row][tone]
    return ''.join(letters)


def is_valid_vietnam_word(word):
    """Tell whether all vowels of ``word`` sit next to each other.

    :param word: the word to check.
    :return: ``False`` as soon as a vowel is found that does not directly
        follow the previous vowel; ``True`` otherwise (including for words
        with no vowel at all).
    """
    previous = -1
    for pos, letter in enumerate(word):
        if letter not in nguyen_am_to_ids:
            continue
        if previous != -1 and pos - previous != 1:
            return False
        previous = pos
    return True


def chuan_hoa_dau_cau_tieng_viet(sentence):
    """Normalise a Vietnamese sentence to old-style tone placement.

    The sentence is lower-cased and split on whitespace, each word goes
    through ``chuan_hoa_dau_tu_tieng_viet`` and the results are joined by
    single spaces.

    :param sentence: the text to normalise.
    :return: the lower-cased, normalised text.
    """
    lowered = sentence.lower()
    return ' '.join(
        chuan_hoa_dau_tu_tieng_viet(word) for word in lowered.split()
    )