hf-similarity-check / get_chinese_code.py
Mitul Mohammad Abdullah Al Mukit
updates
e029c8d
raw
history blame
1.62 kB
from cnocr import CnOcr
import pandas as pd
def check_telecode(input_string):
    """Return True if *input_string* has the shape of a telecode sequence.

    A valid sequence is exactly 8 or 12 ASCII decimal digits, i.e. two or
    three 4-digit codes written back to back with no separators.

    Args:
        input_string: Candidate text (for example one OCR line with the
            spaces already removed).

    Returns:
        bool: True for an 8- or 12-character all-digit string, else False.
    """
    if len(input_string) not in (8, 12):
        return False
    # Deliberately not int(): int() also accepts a leading sign, surrounding
    # whitespace, "_" digit separators and non-ASCII digits, none of which
    # can be sliced into 4-digit codes afterwards.
    return input_string.isascii() and input_string.isdigit()
def extract_integers(input_string):
    """Split a telecode string into its 4-character codes.

    Despite the name, the pieces are returned as strings (slices of the
    input), not as ints, so leading zeros are preserved.

    Args:
        input_string: A string of 8 or 12 characters.

    Returns:
        tuple: Always a 3-tuple ``(w1, w2, w3)``.
            * 12 characters: three 4-character slices.
            * 8 characters: two 4-character slices and ``None`` for ``w3``.
            * any other length: ``(None, None, None)``.
    """
    length = len(input_string)
    if length == 12:
        return input_string[:4], input_string[4:8], input_string[8:]
    if length == 8:
        # Pad with None so callers can always unpack three values.
        return input_string[:4], input_string[4:], None
    return None, None, None
def _lookup_words(df, codes):
    """Return the 'word' value for each code, or None if any code is unknown."""
    words = []
    for code in codes:
        matches = df['word'][df['code'] == code]
        if matches.empty:
            # Code not present in the table (e.g. an OCR misread).
            return None
        words.append(matches.iloc[0])
    return words


def get_chinese_name(path):
    """Read telecodes from an image via OCR and map them to their words.

    The image is run through CnOcr; each recognised text line has its spaces
    removed and is tested with ``check_telecode``. The first line whose codes
    are all found in ``hkTelecode.csv`` (columns ``code`` and ``word``)
    determines the result.

    Args:
        path: Image passed straight to ``CnOcr.ocr``.

    Returns:
        list: The ``word`` entries for the two or three codes of the first
        fully resolvable telecode line, in order; ``[]`` if no such line
        exists.
    """
    ocr = CnOcr(rec_model_name='en_PP-OCRv3')
    # Alternative recognition model left by the original author:
    # 'densenet_lite_136-fc'
    out = ocr.ocr(path)
    # Read codes as strings so leading zeros survive; the relative path is
    # resolved against the current working directory.
    df = pd.read_csv('hkTelecode.csv', dtype={'code': str}, index_col=False)

    for data in out:
        text = data['text'].replace(' ', '')
        if not check_telecode(text):
            continue
        # Drop None padding so this works whether the helper returns two or
        # three values for an 8-digit string.
        codes = [code for code in extract_integers(text) if code is not None]
        chinese_name = _lookup_words(df, codes)
        if chinese_name:
            return chinese_name
    return []
if __name__ == "__main__":
    # Manual check against a local sample image. Guarded so that importing
    # this module does not run OCR as a side effect.
    print(get_chinese_name('dontTouchMe/IMG_4495.jpg'))