Spaces:

OneFi
/

hf-similarity-check

Sleeping

hf-similarity-check / checkTool.py

Mitul Mohammad Abdullah Al Mukit

first commit

1f72938 about 2 years ago

6.35 kB

	import re

	def check_integer(string):
	    """Return True if ``string`` contains at least one digit character.

	    Despite the name, this does not require the whole string to be a
	    number: a single digit anywhere in it is enough. An empty string
	    yields False.
	    """
	    # A fully numeric string necessarily contains a digit, so one scan
	    # covers both cases.
	    return any(char.isdigit() for char in string)

	def check_alpha(string):
	    """Return True if every character is an ASCII letter or a space.

	    Only the ranges ``a-z`` / ``A-Z`` and the space character are accepted;
	    accented or other non-ASCII letters are rejected. An empty string
	    yields True because there is no offending character.
	    """
	    return all(
	        'a' <= char <= 'z' or 'A' <= char <= 'Z' or char == ' '
	        for char in string
	    )

	def is_chinese_name(text):
	    """Heuristically decide whether ``text`` looks like a name line.

	    The test passes when ``text`` is at most 40 characters long and its
	    first eight characters contain at least three ASCII uppercase letters,
	    at least two ASCII lowercase letters and at least one space
	    (e.g. ``'TAMKing Man'``).

	    Returns:
	        bool: True when the heuristic matches, otherwise False.
	    """
	    if len(text) > 40:
	        return False

	    # The counts can only grow as a prefix gets longer, so inspecting the
	    # longest prefix (up to 8 characters) is equivalent to testing every
	    # shorter prefix as well.
	    head = text[:8]
	    upper_count = sum(1 for char in head if 'A' <= char <= 'Z')
	    lower_count = sum(1 for char in head if 'a' <= char <= 'z')
	    space_count = sum(1 for char in head if char == ' ')

	    return upper_count >= 3 and lower_count >= 2 and space_count >= 1

	def seperate_name(text):
	    """Normalise a name by removing all spaces and lower-casing it.

	    Note that, despite its name, the function does not split the name into
	    words: ``'TAMKing Man'`` becomes ``'tamkingman'``.

	    Returns:
	        str: ``text`` without space characters, in lower case.
	    """
	    return text.replace(' ', '').lower()

	def validate_hkid(hkid):
	    """Validate the format and check digit of an HKID number.

	    Accepted shape: one or two uppercase letters, six digits and a check
	    character (``0-9`` or ``A``). Parentheses, e.g. around the check
	    character, are ignored.

	    Each character is mapped to a value (``'0'-'9'`` -> 0-9,
	    ``'A'-'Z'`` -> 10-35, a missing first letter -> 36), multiplied by the
	    weights 9 down to 1, and the number is valid when the weighted sum is
	    divisible by 11.

	    Returns:
	        bool: True for a well-formed number with a correct check digit.
	    """
	    # Parentheses are optional decoration, so drop them before matching.
	    hkid = hkid.replace('(', '').replace(')', '')

	    # fullmatch instead of match + '$': '$' would also accept a trailing
	    # newline, letting 'G123456A\n' through.
	    match = re.fullmatch(r'([A-Z])?([A-Z])([0-9]{6})([0-9A])', hkid)
	    if match is None:
	        return False

	    symbols = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
	    first_letter, second_letter, digits, check_char = match.groups()
	    characters = [first_letter, second_letter, *digits, check_char]

	    weighted_sum = sum(
	        # An absent first prefix letter takes the value right after 'Z'.
	        (len(symbols) if char is None else symbols.index(char)) * weight
	        for char, weight in zip(characters, range(9, 0, -1))
	    )
	    return weighted_sum % 11 == 0

	def format_HKID(hkid):
	    """Return ``hkid`` with its check character wrapped in parentheses.

	    Any parentheses already present are removed first, so both
	    ``'G123456A'`` and ``'G123456(A)'`` produce ``'G123456(A)'``. One- and
	    two-letter prefixes are both supported (``'AB9876543'`` ->
	    ``'AB987654(3)'``).

	    Raises:
	        ValueError: if the input is not one or two uppercase letters
	            followed by six digits and a check character (``0-9`` or
	            ``A``).
	    """
	    hkid = hkid.replace('(', '').replace(')', '')

	    match = re.fullmatch(r'([A-Z]{1,2}[0-9]{6})([0-9A])', hkid)
	    if match is None:
	        # Fail with a descriptive error instead of crashing on ``None``.
	        raise ValueError(f'not a recognisable HKID: {hkid!r}')

	    body, check_char = match.groups()
	    return f'{body}({check_char})'

	def format_issuedate(issuedate):
	    """Strip every ``(``, ``)`` and ``C`` character from ``issuedate``.

	    Returns:
	        str: the remaining text, e.g. ``'(06-96)'`` -> ``'06-96'``.
	    """
	    cleaned = issuedate
	    for unwanted in ('(', ')', 'C'):
	        cleaned = cleaned.replace(unwanted, '')
	    return cleaned

	def is_string_integer(string):
	    """Report whether ``int(string)`` succeeds.

	    Returns:
	        bool: True when the conversion works, False when it raises
	        ``ValueError``.
	    """
	    try:
	        int(string)
	    except ValueError:
	        # Not parseable as an integer literal.
	        return False
	    else:
	        return True

	def check_issuedate(text):
	    """Return True if ``text`` is an issue date of the form ``NN-NN``.

	    The date may be decorated with a leading ``(`` or a leading ``C`` and
	    with a trailing ``)``; these are removed before checking. Accepted
	    inputs therefore look like ``'06-96'``, ``'(06-96)'`` or ``'C06-96'``.

	    Returns:
	        bool: True when, after removing the decoration, exactly two digits,
	        a hyphen and two digits remain.
	    """
	    # Shortest valid input is 'NN-NN' (5), longest is '(NN-NN)' (7).
	    if len(text) < 5 or len(text) > 7:
	        return False

	    if text[0] == '(':
	        text = text.replace('(', '')
	    elif text[0] == 'C':
	        text = text.replace('C', '')
	    # The removal above can leave an empty string (e.g. 'CCCCC'), so guard
	    # before looking at the last character.
	    if text and text[-1] == ')':
	        text = text.replace(')', '')

	    # Require digits explicitly: int()-style parsing would also accept
	    # signs, whitespace and underscores such as '+1-23'.
	    return re.fullmatch(r'\d{2}-\d{2}', text) is not None

	def print_info(name, valid_hkid, hkid, issuedate):
	    """Print the extracted fields to standard output, one per line.

	    Output order: name, then HKID together with its validity flag, then
	    the date of issue.
	    """
	    report_lines = (
	        f'Name: {name}',
	        f'HKID: {hkid} and validity: {valid_hkid}',
	        f'Date of issue: {issuedate}',
	    )
	    for line in report_lines:
	        print(line)

	def is_comma_present(string):
	    """Return True if ``string`` contains a comma, otherwise False."""
	    comma = ','
	    found = comma in string
	    return found

	def longest_common_subsequence(s1, s2):
	    """Return a longest common subsequence of ``s1`` and ``s2`` as a string.

	    A table of subsequence lengths is filled for every pair of prefixes and
	    then walked backwards from the bottom-right corner to recover the
	    characters. When both neighbouring cells are equal, the walk moves left
	    in ``s2``; this decides which subsequence is returned if several have
	    the same length.
	    """
	    rows, cols = len(s1), len(s2)

	    # table[r][c] = LCS length of s1[:r] and s2[:c]; row/column 0 stay 0.
	    table = [[0 for _ in range(cols + 1)] for _ in range(rows + 1)]
	    for r, left_char in enumerate(s1, start=1):
	        for c, right_char in enumerate(s2, start=1):
	            if left_char == right_char:
	                table[r][c] = table[r - 1][c - 1] + 1
	            else:
	                table[r][c] = max(table[r - 1][c], table[r][c - 1])

	    # Walk back from the corner, collecting matches in reverse order.
	    collected = []
	    r, c = rows, cols
	    while r > 0 and c > 0:
	        if s1[r - 1] == s2[c - 1]:
	            collected.append(s1[r - 1])
	            r -= 1
	            c -= 1
	        elif table[r - 1][c] > table[r][c - 1]:
	            r -= 1
	        else:
	            c -= 1

	    return ''.join(reversed(collected))

	def combine_info(info1, info2):
	    """Merge two extraction results into a single record.

	    Both arguments are indexed as ``[name, validity, hkid, issuedate]``,
	    where validity is the string ``'True'`` or ``'False'``.

	    Merge rules:
	      * name: taken as-is when both agree; when one is empty the other is
	        used; otherwise their longest common subsequence is used.
	      * validity / HKID: when exactly one result is flagged ``'True'`` its
	        pair is used; when both are ``'True'`` and the HKIDs agree that
	        pair is used; in every other case (disagreeing valid HKIDs, or no
	        valid HKID at all) the pair is ``'False'`` / ``'Suspicous HKID'``.
	      * issue date: kept when both agree, else ``'Unmatched issuedate'``.

	    Both inputs are also echoed to standard output.

	    Returns:
	        list: always four items, in the same order as the inputs, so the
	        result can be unpacked into ``print_info``.
	    """
	    combined_info = []

	    print(info1)
	    print(info2)

	    name1, name2 = info1[0], info2[0]
	    if name1 == name2:
	        combined_info.append(name1)
	    elif name1 == '':
	        combined_info.append(name2)
	    elif name2 == '':
	        combined_info.append(name1)
	    else:
	        combined_info.append(longest_common_subsequence(name1, name2))

	    first_valid = info1[1] == 'True'
	    second_valid = info2[1] == 'True'
	    if first_valid and second_valid and info1[2] == info2[2]:
	        combined_info.extend([info1[1], info1[2]])
	    elif first_valid and not second_valid:
	        combined_info.extend([info1[1], info1[2]])
	    elif second_valid and not first_valid:
	        combined_info.extend([info2[1], info2[2]])
	    else:
	        # Either both claim validity but disagree, or neither is valid.
	        # Always emit a pair so the record keeps its four-field layout.
	        combined_info.extend(['False', 'Suspicous HKID'])

	    if info1[3] == info2[3]:
	        combined_info.append(info1[3])
	    else:
	        combined_info.append('Unmatched issuedate')

	    return combined_info



	# info1 = ['', 'True', 'Z683365(5)', '06-96']
	# info2 = ['lok wing', 'False', 'Z68336505)', '06-96']
	# info = combine_info(info1, info2)
	# print_info(*info)


	# text = 'TAMKing Man'
	# if is_comma_present(text):
	# text = text.replace(',', '')
	# if not check_integer(text):
	# if check_alpha(text) and is_chinese_name(text):
	# name = seperate_name(text)