hf-similarity-check / checkTool.py
Mitul Mohammad Abdullah Al Mukit
first commit
1f72938
import re
def check_integer(string):
    """Report whether ``string`` contains at least one digit character.

    Despite the name, the text does not have to consist solely of digits:
    a single character for which ``str.isdigit`` is true is enough.

    Args:
        string: The text to inspect.

    Returns:
        True if any character is a digit, otherwise False (an empty string
        yields False).
    """
    # The whole-string test is a fast path; the per-character scan decides
    # every remaining case.
    return string.isdigit() or any(symbol.isdigit() for symbol in string)
def check_alpha(string):
    """Report whether ``string`` is made up only of ASCII letters and spaces.

    Args:
        string: The text to inspect.

    Returns:
        True if every character is in ``a-z``, ``A-Z`` or is a space
        (an empty string yields True); False as soon as any other
        character is present.
    """
    def _is_letter_or_space(symbol):
        return 'a' <= symbol <= 'z' or 'A' <= symbol <= 'Z' or symbol == ' '

    return all(_is_letter_or_space(symbol) for symbol in string)
def is_chinese_name(text):
    """Heuristically test whether ``text`` starts like a name line.

    Only the first eight characters are examined. They must contain at
    least three ASCII upper-case letters, at least two ASCII lower-case
    letters and at least one space. Text longer than 40 characters is
    rejected outright.

    Args:
        text: Candidate text.

    Returns:
        True if the leading characters satisfy the thresholds above,
        otherwise False.
    """
    if len(text) > 40:
        return False

    # The counts can only grow as a prefix gets longer, so testing the
    # longest prefix (up to 8 characters) is equivalent to testing every
    # prefix of length 1 through 8.
    head = text[:8]
    capitals = sum(1 for symbol in head if 'A' <= symbol <= 'Z')
    smalls = sum(1 for symbol in head if 'a' <= symbol <= 'z')
    blanks = sum(1 for symbol in head if symbol == ' ')
    return capitals >= 3 and smalls >= 2 and blanks >= 1
def seperate_name(text):
    """Normalise a name by removing every space and lower-casing it.

    Args:
        text: The name text to normalise.

    Returns:
        ``text`` with all space characters removed, converted to lower
        case (e.g. ``'TAMKing Man'`` becomes ``'tamkingman'``).
    """
    return text.replace(' ', '').lower()
def validate_hkid(hkid):  # parentheses around the check character are optional
    """Validate an HKID string by its format and weighted checksum.

    Parentheses are stripped first. The remainder must be an optional
    upper-case letter, one upper-case letter, six digits and a final check
    character that is a digit or ``'A'``.

    Each position is weighted 9 down to 1. Digits count as their value,
    letters ``A``-``Z`` as 10-35, and an absent leading letter as 36. The
    ID is valid when the weighted sum is divisible by 11.

    Args:
        hkid: The HKID text, with or without parentheses.

    Returns:
        True if the format matches and the checksum is 0 modulo 11,
        otherwise False.
    """
    compact = hkid.replace('(', '').replace(')', '')
    parsed = re.match('^([A-Z])?([A-Z])([0-9]{6})([0-9A])$', compact)
    if parsed is None:
        return False

    alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    symbol_value = {symbol: rank for rank, symbol in enumerate(alphabet)}
    # A missing first letter occupies the slot right after 'Z'.
    symbol_value[None] = len(alphabet)

    # Flatten the groups into nine positions; an unmatched optional group
    # stays as a single None placeholder.
    positions = []
    for group in parsed.groups():
        if group:
            positions.extend(group)
        else:
            positions.append(group)

    total = 0
    for factor, symbol in zip(range(9, 0, -1), positions):
        total += symbol_value[symbol] * factor
    return total % 11 == 0
def format_HKID(hkid):
    """Return ``hkid`` in canonical form with the check character in parentheses.

    Any existing parentheses are ignored, so ``'A1234563'`` and
    ``'A123456(3)'`` both become ``'A123456(3)'``. IDs with one or two
    leading letters are both supported.

    Args:
        hkid: The HKID text, with or without parentheses.

    Returns:
        The formatted ID, e.g. ``'AB987654(3)'``. If the text does not have
        the HKID shape (one or two upper-case letters, six digits, a check
        character that is a digit or ``'A'``), ``hkid`` is returned
        unchanged.
    """
    compact = hkid.replace('(', '').replace(')', '')
    match = re.match('^([A-Z])?([A-Z])([0-9]{6})([0-9A])$', compact)
    if match is None:
        # Nothing sensible to format; hand the text back untouched rather
        # than failing on a missing match.
        return hkid

    first_letter, second_letter, digits, check = match.groups()
    # The first letter is optional and is None when absent.
    return f"{first_letter or ''}{second_letter}{digits}({check})"
def format_issuedate(issuedate):
    """Strip wrapper characters from an issue-date string.

    Args:
        issuedate: The raw issue-date text.

    Returns:
        ``issuedate`` with every ``'('``, ``')'`` and ``'C'`` character
        removed (e.g. ``'(06-96)'`` becomes ``'06-96'``).
    """
    # One pass that deletes all three unwanted characters.
    unwanted = str.maketrans('', '', '()C')
    return issuedate.translate(unwanted)
def is_string_integer(string):
    """Report whether ``int()`` can parse ``string``.

    Args:
        string: The text to test.

    Returns:
        True if ``int(string)`` succeeds, False if it raises ``ValueError``.
        Anything ``int()`` accepts counts, including a leading sign or
        surrounding whitespace.
    """
    try:
        int(string)
    except ValueError:
        # Not parseable as a base-10 integer.
        return False
    else:
        return True
def check_issuedate(text):
    """Check that ``text`` looks like an issue date such as ``'06-96'``.

    The text may carry a leading ``'('`` or ``'C'`` and a trailing ``')'``.
    Once those are removed it must be exactly two ASCII digits, a hyphen,
    and two ASCII digits.

    Args:
        text: Candidate issue-date text.

    Returns:
        True if the text has the expected shape, otherwise False.
    """
    # Bare form is 5 characters; one wrapper on each side gives at most 7.
    if len(text) < 5 or len(text) > 7:
        return False

    # Drop the leading wrapper character (every occurrence of it).
    if text[0] == '(':
        text = text.replace('(', '')
    elif text[0] == 'C':
        text = text.replace('C', '')
    # Stripping can leave an empty string, so guard before indexing.
    if text and text[-1] == ')':
        text = text.replace(')', '')

    if len(text) != 5 or text[2] != '-':
        return False

    # Test each remaining character individually: parsing the joined text
    # with int() would wrongly accept signs, whitespace or a second hyphen
    # (e.g. '-1-23', '06--6').
    return all(symbol in '0123456789' for symbol in text[:2] + text[3:])
def print_info(name, valid_hkid, hkid, issuedate):
    """Print the extracted fields to standard output, one per line.

    Args:
        name: The extracted name.
        valid_hkid: The validity flag reported alongside the HKID.
        hkid: The HKID text.
        issuedate: The issue-date text.
    """
    report_lines = (
        f'Name: {name}',
        f'HKID: {hkid} and validity: {valid_hkid}',
        f'Date of issue: {issuedate}',
    )
    for line in report_lines:
        print(line)
def is_comma_present(string):
    """Report whether ``string`` contains a comma.

    Args:
        string: The text to inspect.

    Returns:
        True if ``','`` occurs in ``string``, otherwise False.
    """
    has_comma = ',' in string
    return has_comma
def longest_common_subsequence(s1, s2):
    """Return one longest common subsequence of ``s1`` and ``s2``.

    A dynamic-programming table of subsequence lengths is filled first and
    then walked backwards from the bottom-right corner to recover the
    characters.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A string holding a longest common subsequence; ``''`` when the
        inputs share no characters or either is empty.
    """
    rows, cols = len(s1), len(s2)

    # table[r][c] is the LCS length of s1[:r] and s2[:c].
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for r, left in enumerate(s1, start=1):
        above, current = table[r - 1], table[r]
        for c, right in enumerate(s2, start=1):
            if left == right:
                current[c] = above[c - 1] + 1
            else:
                current[c] = max(above[c], current[c - 1])

    # Walk back through the table, collecting matched characters in
    # reverse order.
    collected = []
    r, c = rows, cols
    while r and c:
        if s1[r - 1] == s2[c - 1]:
            collected.append(s1[r - 1])
            r -= 1
            c -= 1
        elif table[r - 1][c] > table[r][c - 1]:
            r -= 1
        else:
            # On a tie, step left (towards a shorter s2 prefix).
            c -= 1

    return ''.join(reversed(collected))
def combine_info(info1, info2):
    """Merge two ``[name, validity, hkid, issuedate]`` records into one.

    Both inputs are printed first, then each field is reconciled:

    * Name: identical names are kept; if one is empty the other is used;
      otherwise the longest common subsequence of the two is used.
    * Validity / HKID: the validity flags are the strings ``'True'`` and
      ``'False'``. If exactly one record is ``'True'``, its flag and HKID
      are used. If both are ``'True'`` with the same HKID, that pair is
      used. Every other combination yields ``'False'`` and
      ``'Suspicous HKID'``.
    * Issue date: kept when both agree, otherwise ``'Unmatched issuedate'``.

    Args:
        info1: First record; indices 0-3 are read.
        info2: Second record; indices 0-3 are read.

    Returns:
        A four-element list ``[name, validity, hkid, issuedate]``.
    """
    print(info1)
    print(info2)

    name1, name2 = info1[0], info2[0]
    if name1 == name2:
        name = name1
    elif name1 == '':
        name = name2
    elif name2 == '':
        name = name1
    else:
        name = longest_common_subsequence(name1, name2)

    valid1, valid2 = info1[1], info2[1]
    hkid1, hkid2 = info1[2], info2[2]
    if valid1 == 'True' and valid2 == 'False':
        validity, hkid = valid1, hkid1
    elif valid1 == 'False' and valid2 == 'True':
        validity, hkid = valid2, hkid2
    elif valid1 == 'True' and valid2 == 'True' and hkid1 == hkid2:
        validity, hkid = valid1, hkid1
    else:
        # Covers conflicting valid IDs and any combination with no trusted
        # ID, so the result always has all four fields.
        validity, hkid = 'False', 'Suspicous HKID'

    if info1[3] == info2[3]:
        issuedate = info1[3]
    else:
        issuedate = 'Unmatched issuedate'

    return [name, validity, hkid, issuedate]
# info1 = ['', 'True', 'Z683365(5)', '06-96']
# info2 = ['lok wing', 'False', 'Z68336505)', '06-96']
# info = combine_info(info1, info2)
# print_info(*info)
# text = 'TAMKing Man'
# if is_comma_present(text):
# text = text.replace(',', '')
# if not check_integer(text):
# if check_alpha(text) and is_chinese_name(text):
# name = seperate_name(text)