Spaces:

Suevar
/

OCR

Runtime error

App Files Files Community

OCR / OCR_web.py

Suevar

Upload 12 files

2bcf29e verified 6 months ago

raw

history blame contribute delete

No virus

9.09 kB

	from paddleocr import PaddleOCR, draw_ocr
	from PIL import Image
	import math
	import numpy as np
	import cv2
	import re
	import tempfile
	from pathlib import Path

	def image_split(imageBytes, max_height=2000):
	    """Decode an encoded image and cut it into horizontal strips.

	    Args:
	        imageBytes: Raw bytes of an encoded image file, decoded with
	            ``cv2.imdecode`` in ``cv2.IMREAD_COLOR`` mode.
	        max_height: Images with fewer pixel rows than this are returned as a
	            single strip.  Taller images are cut into
	            ``ceil(height / max_height)`` strips of (almost) equal height.

	    Returns:
	        A list of image arrays ordered top to bottom.  Together the strips
	        cover every row of the decoded image exactly once.

	    Raises:
	        ValueError: If ``max_height`` is not positive or the bytes cannot be
	            decoded as an image.
	    """
	    if max_height <= 0:
	        raise ValueError("max_height must be a positive number of pixel rows")

	    image = cv2.imdecode(np.frombuffer(imageBytes, np.uint8), cv2.IMREAD_COLOR)
	    if image is None:
	        # cv2.imdecode signals failure by returning None rather than raising.
	        raise ValueError("imageBytes could not be decoded as an image")

	    h = image.shape[0]
	    if h < max_height:
	        return [image]

	    num = int(math.ceil(h / max_height))
	    # Spread the rows left over by the integer division across the first
	    # strips so that no row at the bottom of the image is dropped.
	    base_height, extra_rows = divmod(h, num)

	    sub_images = []
	    startY = 0
	    for i in range(num):
	        endY = startY + base_height + (1 if i < extra_rows else 0)
	        # Copy so each strip owns its pixels instead of viewing the source.
	        sub_images.append(image[startY:endY].copy())
	        startY = endY
	    return sub_images

	def image_resize(image):
	    """Return ``image`` scaled to a width of 541 px, keeping its aspect ratio.

	    ``msg_type`` maps OCR coordinates with the same 541 constant, so the two
	    values have to stay in sync.
	    """
	    target_width = 541
	    src_height, src_width = image.shape[:2]
	    # Uniform scale factor derived from the width only.
	    scale = target_width / float(src_width)
	    target_size = (target_width, int(src_height * scale))
	    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)

	def msg_type(line_list, image, ori_width):
	    """Extract text, sample point, vertical extent and colour for one OCR line.

	    Args:
	        line_list: One OCR entry of the form ``[box, (text, score)]``.  Point 0
	            of ``box`` supplies the left x and top y, point 3 supplies the
	            bottom y.
	        image: The picture already resized to a width of 541 px; indexed as
	            ``image[row][column]``.
	        ori_width: Width of the picture the OCR coordinates refer to, used to
	            map them onto the resized ``image``.

	    Returns:
	        ``(words, (x, y_mid), (y_min, y_max), color)`` where all coordinates
	        are in resized-image pixels and ``color`` is the pixel at
	        ``(x, y_mid)``.  If ``line_list`` does not have exactly two items the
	        same shape is returned with every leaf set to ``None``, so callers
	        can always unpack the result the same way.
	    """
	    if len(line_list) != 2:
	        # Keep the nesting identical to the success case; the previous
	        # 3-item fallback could not be unpacked by the caller.
	        return None, (None, None), (None, None), None

	    resize_ratio = 541 / ori_width
	    box, text_info = line_list
	    words = text_info[0]
	    x = int(box[0][0] * resize_ratio)
	    y_min = int(box[0][1] * resize_ratio)
	    y_max = int(box[3][1] * resize_ratio)
	    # Sample at the vertical centre of the first character so the colour read
	    # for the speech-bubble background is more reliable.
	    y_mid = y_min + int((y_max - y_min) / 2)
	    color = image[y_mid][x]

	    return words, (x, y_mid), (y_min, y_max), color

	# A bare chat timestamp: an optional date part (full date, month/day,
	# "yesterday" or a weekday), an optional part of day, then H:MM.
	_TIMESTAMP_PATTERN = re.compile(
	    r'^(?:\d{4}年\d{1,2}月\d{1,2}日|\d{1,2}月\d{1,2}日|昨天|星期[一二三四五六日])?'
	    r'(?:凌晨|上午|下午|晚上)?'
	    r'\d{1,2}:\d{1,2}$'
	)


	def date_delete(words, line_text, line, dialog, res):
	    """Tell whether ``words`` is only a timestamp and should be discarded.

	    Recognised shapes are ``24:24`` on its own or preceded by any combination
	    of a date part (``2024年4月12日``, ``4月12日``, ``昨天``, ``星期一`` ...
	    ``星期日``) and a part of day (``凌晨``, ``上午``, ``下午``, ``晚上``).

	    Args:
	        words: Recognised text of the current OCR line.
	        line_text: The message accumulated so far and not yet stored.
	        line: The current OCR entry.
	        dialog: List of finished messages; may be appended to.
	        res: All OCR entries of the current page; must not be empty.

	    Returns:
	        ``True`` if ``words`` is a timestamp, otherwise ``False``.

	    Side effects:
	        When a timestamp is the last entry of ``res``, ``line_text`` is
	        appended to ``dialog`` so the pending message is not lost.
	    """
	    if not _TIMESTAMP_PATTERN.match(words):
	        return False
	    if line == res[-1]:
	        dialog.append(line_text)
	    return True

	def OCR_text(img_path: str, output_path: str) -> Path:
	    """OCR a chat screenshot and append the rebuilt dialog to ``result.txt``.

	    The picture is cut into strips by ``image_split`` and every strip is
	    recognised on its own, so the boxes returned by the OCR engine refer to
	    the same pixels that are sampled for the bubble colour.  Running the OCR
	    once on the whole file and replaying its boxes against every strip would
	    index past the strip height and repeat each line once per strip.

	    Lines are dropped when their confidence is below 0.7, when they are known
	    interface strings, timestamps (``date_delete``) or voice-message lengths
	    such as ``12"``.  The remaining lines are attributed by the colour sampled
	    in ``msg_type``: all three channels above 160 means ``Speaker1``, anything
	    else ``Speaker2``.  A line whose gap to the previous kept line is smaller
	    than its own height is joined to the pending message.

	    Args:
	        img_path: Path of the screenshot to read.
	        output_path: Directory that receives ``result.txt``; the file is
	            opened in append mode.

	    Returns:
	        Path of the text file the dialog was written to.
	    """
	    ocr = PaddleOCR(use_angle_cls=True, lang="ch", ocr_version="PP-OCRv4")

	    with open(img_path, "rb") as f:
	        imageBytes = f.read()
	    image_list = image_split(imageBytes)

	    # Interface strings and system notices that are not conversation content.
	    skip_words = frozenset([
	        '...', '我通过了你的朋友验证请求，现在', '我们可以开始聊天了', '以上是打招呼的内容', '三',
	        '+', '你撤回了一条消息', 'HD', ':', '按住说话', '以下是新消息', '川',
	    ])
	    # Length label of a voice message, e.g. 12"
	    voice_pattern = re.compile(r'^\d{1,3}"$')

	    dialog = []
	    output_txt_path = Path(output_path) / "result.txt"

	    for sub_image in image_list:
	        ori_width = sub_image.shape[1]
	        result = ocr.ocr(sub_image, cls=True)
	        image = image_resize(sub_image)

	        # Vertical positions restart with every strip, so the pending message
	        # must not be carried over from the previous one.
	        line_text = ''
	        y_max_last = 0

	        for res in result:
	            if not res:
	                # Nothing was recognised on this page.
	                continue
	            for line in res:
	                # A skipped line can still be the last one of the page, in
	                # which case the pending message has to be stored right away.
	                is_last = line == res[-1]

	                if len(line) != 2:
	                    if is_last:
	                        dialog.append(line_text)
	                    continue

	                words, (x, y_mid), (y_min, y_max), color = msg_type(line, image, ori_width)

	                if line[1][1] < 0.7 or words in skip_words:
	                    if is_last:
	                        dialog.append(line_text)
	                    continue

	                # date_delete stores the pending message itself when needed.
	                if date_delete(words, line_text, line, dialog, res):
	                    continue

	                if voice_pattern.match(words):
	                    if is_last:
	                        dialog.append(line_text)
	                    continue

	                # Speaker detection by bubble colour (needs further testing):
	                # a light background is Speaker1, anything else Speaker2.
	                # NOTE(review): the two prefixes use different colon
	                # characters (full-width vs ASCII); kept unchanged because
	                # consumers of result.txt may rely on them.
	                if color[0] > 160 and color[1] > 160 and color[2] > 160:
	                    prefix = 'Speaker1：'
	                else:
	                    prefix = 'Speaker2:'

	                if line_text == '':
	                    line_text = prefix + words
	                elif (y_min - y_max_last) < (y_max - y_min):
	                    # Gap smaller than the line height: same message.
	                    line_text = line_text + words
	                else:
	                    dialog.append(line_text)
	                    line_text = prefix + words
	                y_max_last = y_max

	                if is_last:
	                    dialog.append(line_text)

	    # Save the recognised dialog as text, one message per line.
	    with open(output_txt_path, 'a+', encoding='utf-8') as f:
	        f.writelines(dia + '\n' for dia in dialog)

	    print(f'output dir is: {output_txt_path}')
	    return output_txt_path





	if __name__ == '__main__':
	    # Manual run: OCR the sample picture and write result.txt into a newly
	    # created temporary directory.
	    OCR_text('./image/3.png', tempfile.mkdtemp())