import json
import logging
import os
import random
from statistics import mean

import gradio as gr
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as FU
import torchvision.transforms.functional as F
from huggingface_hub import hf_hub_download
from PIL import Image, ImageDraw, ImageEnhance
from pypinyin import pinyin
from torchvision import transforms
from ultralytics import YOLO
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize LLM as None - will be loaded lazily
llm = None
def load_llm():
"""Lazy loading of LLM to avoid startup delays"""
global llm
if llm is None:
try:
logger.info("Loading LLM model...")
from llama_cpp import Llama
# Check if model exists locally first
model_filename = "Yi-1.5-9B-Chat-Q6_K.gguf"
local_model_path = os.path.join("./models", model_filename)
if not os.path.exists(local_model_path):
logger.info("Downloading LLM model from HuggingFace...")
model_path = hf_hub_download(
repo_id="IncreasingLoss/FineTunedTranslation_Yi-1.5-9B-Chat-Q6_K",
filename=model_filename,
local_dir="./models",
local_dir_use_symlinks=False
)
else:
model_path = local_model_path
logger.info(f"Using existing model at: {model_path}")
# Initialize with conservative settings for HF Spaces
llm = Llama(
model_path=model_path,
n_ctx=2048, # Reduced context size
n_gpu_layers=0, # CPU only for HF Spaces compatibility
verbose=False,
n_threads=2, # Limit threads for shared environment
use_mmap=True, # Memory mapping for efficiency
use_mlock=False # Don't lock memory
)
logger.info("LLM model loaded successfully!")
except Exception as e:
logger.error(f"Failed to load LLM: {e}")
llm = None
return llm
"""yolo model"""
logger.info("Loading YOLO model...")
user_device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {user_device}")
try:
detection_model = YOLO("model/yolo_chinese_m.pt").to(user_device).eval()
logger.info("YOLO model loaded successfully!")
except Exception as e:
logger.error(f"Failed to load YOLO model: {e}")
raise
"""LW-Vit Classifier"""
class HSwish(nn.Module):
"""Hard Swish activation function"""
def forward(self, x):
out = x * FU.relu6(x + 3, inplace=True) / 6
return out
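# h-swish(x) = x * relu6(x + 3) / 6, a cheap piecewise approximation of swish (x * sigmoid(x))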
class MV2_Block(nn.Module):
"""MobileNetV2 Inverted Residual Block with h-swish activation"""
def __init__(self, in_channels, expand_channels, out_channels, stride):
super().__init__()
self.stride = stride
self.use_res_connect = self.stride == 1 and in_channels == out_channels
self.expand = nn.Sequential(
nn.Conv2d(in_channels, expand_channels, kernel_size=1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(expand_channels),
HSwish()
)
self.depthwise = nn.Sequential(
nn.Conv2d(expand_channels, expand_channels, kernel_size=3, stride=stride,
padding=1, groups=expand_channels, bias=False),
nn.BatchNorm2d(expand_channels),
HSwish()
)
self.project = nn.Sequential(
nn.Conv2d(expand_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(out_channels)
)
def forward(self, x):
if self.use_res_connect:
return x + self.project(self.depthwise(self.expand(x)))
else:
return self.project(self.depthwise(self.expand(x)))
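# Illustrative behavior of MV2_Block with the arguments LW_ViT passes below
# (base_channels=16): stride=1 with in_channels == out_channels keeps the shape
# ([B, 16, 64, 64] -> [B, 16, 64, 64]) and adds the residual shortcut; stride=2
# halves the spatial size ([B, 16, 64, 64] -> [B, 16, 32, 32]) with no shortcut.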
class LW_ViT_Transformer_Block(nn.Module):
"""Lightweight Vision Transformer Block"""
def __init__(self, in_channels, out_channels, patch_size=2, heads=4, dim=128):
super().__init__()
self.patch_size = patch_size
self.in_channels = in_channels
self.out_channels = out_channels
self.dim = dim
# Downsampling and channel adjustment
self.downsample = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=1)
self.channel_adjust = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
# Calculate patch dimension
patch_dim = out_channels * patch_size * patch_size
# Add projection layer if needed
self.projection = nn.Identity() if patch_dim == dim else nn.Linear(patch_dim, dim)
# Transformer components
self.norm1 = nn.LayerNorm(dim)
self.attn = nn.MultiheadAttention(embed_dim=dim, num_heads=heads, batch_first=True)
self.norm2 = nn.LayerNorm(dim)
self.ffn = nn.Sequential(
nn.Linear(dim, dim * 4),
nn.GELU(),
nn.Linear(dim * 4, dim)
)
# Final processing
self.final_conv = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.norm_out = nn.BatchNorm2d(out_channels)
self.act = HSwish()
def forward(self, x):
# Initial downsampling and channel adjustment
x = self.downsample(x)
x = self.channel_adjust(x)
B, C, H, W = x.shape
# Convert feature map to patches and embed
# Reshape to [B, C, H/P, P, W/P, P]
x_reshaped = x.reshape(B, C, H // self.patch_size, self.patch_size,
W // self.patch_size, self.patch_size)
# Permute to [B, H/P, W/P, C, P, P]
x_permuted = x_reshaped.permute(0, 2, 4, 1, 3, 5)
# Reshape to [B, H/P * W/P, C * P * P]
patches = x_permuted.reshape(B, (H // self.patch_size) * (W // self.patch_size), -1)
# Apply projection if needed
patches = self.projection(patches)
# Apply transformer operations
normed_patches = self.norm1(patches)
attn_out, _ = self.attn(normed_patches, normed_patches, normed_patches)
patches = patches + attn_out
normed_patches = self.norm2(patches)
ffn_out = self.ffn(normed_patches)
patches = patches + ffn_out
# Reshape back to feature map
# Reshape to [B, H/P, W/P, C, P, P]
x_back = patches.reshape(B, H // self.patch_size, W // self.patch_size,
C, self.patch_size, self.patch_size)
# Permute to [B, C, H/P, P, W/P, P]
x_back = x_back.permute(0, 3, 1, 4, 2, 5)
# Reshape to [B, C, H, W]
x_out = x_back.reshape(B, C, H, W)
# Final processing
out = self.final_conv(x_out)
out = self.norm_out(out)
out = self.act(out)
return out
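# Illustrative shape trace for the block above, using the configuration LW_ViT passes in
# (in_channels=16, out_channels=32, patch_size=2, dim=128):
#   input         [B, 16, 16, 16]
#   downsample -> [B, 16, 8, 8]    (stride-2 3x3 conv)
#   adjust     -> [B, 32, 8, 8]    (1x1 conv)
#   patchify   -> [B, 16, 128]     (4x4 grid of 2x2 patches; 32*2*2 = 128 = dim, so projection is Identity)
#   attention and FFN keep [B, 16, 128]; the patches are then folded back to [B, 32, 8, 8]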
class LW_ViT(nn.Module):
"""Lightweight Vision Transformer for Chinese Character Recognition"""
def __init__(self, base_channels=16, num_classes=3892):
super().__init__()
# Stem layer
self.stem = nn.Sequential(
nn.Conv2d(3, base_channels, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(base_channels),
HSwish()
)
# Feature extraction layers
self.features = nn.Sequential(
# MV2 blocks
MV2_Block(base_channels, 4*base_channels, base_channels, stride=1),
MV2_Block(base_channels, 4*base_channels, base_channels, stride=2),
MV2_Block(base_channels, 4*base_channels, base_channels, stride=1),
MV2_Block(base_channels, 4*base_channels, base_channels, stride=2),
# Transformer block
LW_ViT_Transformer_Block(
in_channels=base_channels,
out_channels=2*base_channels,
patch_size=2,
heads=4,
dim=128
),
# Final convolutional layer
nn.Conv2d(2*base_channels, 4*base_channels, kernel_size=1, stride=1),
nn.BatchNorm2d(4*base_channels),
HSwish()
)
# Global pooling and classifier
self.global_pool = nn.AdaptiveAvgPool2d(1)
self.dropout = nn.Dropout(0.1) # Added dropout for regularization
self.classifier = nn.Sequential(
nn.Linear(4*base_channels, 8*base_channels),
nn.BatchNorm1d(8*base_channels),
HSwish(),
nn.Dropout(0.1), # Added dropout for regularization
nn.Linear(8*base_channels, num_classes)
)
def forward(self, x):
x = self.stem(x)
x = self.features(x)
x = self.global_pool(x)
x = torch.flatten(x, 1)
x = self.dropout(x)
x = self.classifier(x)
return x
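# Illustrative end-to-end shape trace (assumes a 3x128x128 input and the default base_channels=16):
#   stem (stride 2)        -> [B, 16, 64, 64]
#   MV2 blocks (s=1,2,1,2) -> [B, 16, 16, 16]
#   transformer block      -> [B, 32,  8,  8]
#   1x1 conv               -> [B, 64,  8,  8]
#   global pool + flatten  -> [B, 64]
#   classifier             -> [B, 3892]  (one logit per character class)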
logger.info("Loading classifier model...")
try:
classifier = LW_ViT()
classifier.load_state_dict(torch.load("model/chinese_Character_Classification_handwritten_LW_ViT.pth", weights_only=True, map_location=user_device))
classifier = classifier.to(user_device)
classifier.eval()
logger.info("Classifier model loaded successfully!")
except Exception as e:
logger.error(f"Failed to load classifier: {e}")
raise
"""load classes dict"""
logger.info("Loading classes dictionary...")
try:
with open("model/chinese_classes.json", "r", encoding="utf-8") as f:
classes_dict = json.load(f)
logger.info(f"Loaded {len(classes_dict)} classes")
except Exception as e:
logger.error(f"Failed to load classes dictionary: {e}")
raise
"""transforms"""
class ToSquare(object):
"""
Transform to make images square by padding the shorter dimension
"""
def __init__(self, fill=0):
self.fill = fill # Fill value for padding (0 = black)
def __call__(self, img):
w, h = img.size
# If already square, return as is
if w == h:
return img
# Calculate target size (max dimension)
max_dim = max(w, h)
# Calculate padding
pad_w = (max_dim - w) // 2
pad_h = (max_dim - h) // 2
# Handle odd dimensions (extra pixel on one side)
pad_w_extra = (max_dim - w) % 2
pad_h_extra = (max_dim - h) % 2
# Create padding list (left, top, right, bottom)
padding = [pad_w, pad_h, pad_w + pad_w_extra, pad_h + pad_h_extra]
# Create new padded image
padded_img = F.pad(img, padding, self.fill)
return padded_img
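# Worked example for ToSquare (illustrative): a 60x100 (WxH) crop has max_dim=100,
# pad_w=(100-60)//2=20 and pad_h=0, so padding=[20, 0, 20, 0] and the result is a
# centered 100x100 image; test_transforms below uses fill=255 for a white background.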
class ConvertToRGB(object):
"""Convert image to RGB mode"""
def __call__(self, img):
# Convert any image mode to RGB (including RGBA)
return img.convert('RGB')
class Contrast(object):
    """With probability p, scale contrast by a factor drawn uniformly from factor_range"""
    def __init__(self, p=0.3, factor_range=(0.5, 1.5)):
        self.p = p
        self.factor_range = factor_range
    def __call__(self, img):
        # Randomly adjust contrast with probability p; otherwise return the image unchanged
        if random.random() >= self.p:
            return img
        factor = random.uniform(*self.factor_range)
        return ImageEnhance.Contrast(img).enhance(factor)
test_transforms = transforms.Compose([
ToSquare(fill=255),
transforms.Resize(128),
ConvertToRGB(),
transforms.ToTensor(),
transforms.Normalize(mean=[0.5], std=[0.5])
])
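# Note on the pipeline above: Normalize with single-element mean/std broadcasts the same
# (0.5, 0.5) normalization across all three RGB channels, mapping pixels from [0, 1] to [-1, 1].
# Illustrative example (assumed input, not part of the app flow):
#   test_transforms(Image.new("RGB", (60, 100), "white")) -> tensor of shape [3, 128, 128]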
"""app functions"""
def convert_to_pil(inputimg):
# convert input to pil image
if inputimg is None:
return None
if isinstance(inputimg, np.ndarray):
pil_img = Image.fromarray(inputimg)
elif hasattr(inputimg, 'convert'): # Check if it's already a PIL Image
pil_img = inputimg
else:
pil_img = Image.fromarray(np.array(inputimg))
pil_img = pil_img.convert("RGB")
return pil_img
def draw_bboxes(img, results):
# Create a copy of the input image to draw on
draw_img = img.copy()
draw = ImageDraw.Draw(draw_img)
# Draw bounding boxes for detected objects
for box in results[0].boxes.xyxy:
x1, y1, x2, y2 = map(int, box) # Convert to integers
draw.rectangle([x1, y1, x2, y2], outline=(255, 0, 0), width=2)
return draw_img
def crop_bboxes_by_reading_order(img, bboxes, written_vertical=False):
"""
Crop and return image regions in proper reading order.
For vertical Chinese:
- Group into columns by x-coordinate (clusters whose centers are within avg_width/2)
- Order columns right-to-left
- Within each column, order top-to-bottom by y
For horizontal:
- Group into rows by y-coordinate (clusters within avg_height/2)
- Order rows top-to-bottom
- Within each row, order left-to-right by x
"""
# 1) If bboxes is a Tensor, convert to Python list
if hasattr(bboxes, "tolist"):
bboxes = bboxes.tolist()
# 2) Single box -> trivial crop
if len(bboxes) <= 1:
return [img.crop(tuple(map(int, bbox))) for bbox in bboxes]
# 3) Compute centers and average glyph size
centers = [((x1+x2)/2, (y1+y2)/2) for x1,y1,x2,y2 in bboxes]
widths = [x2 - x1 for x1,y1,x2,y2 in bboxes]
heights = [y2 - y1 for x1,y1,x2,y2 in bboxes]
avg_w = mean(widths)
avg_h = mean(heights)
# 4) Choose clustering axis and threshold
if written_vertical:
coords = [c[0] for c in centers] # x-coords
threshold = avg_w / 2
else:
coords = [c[1] for c in centers] # y-coords
threshold = avg_h / 2
# 5) Sort glyphs by the clustering axis
order = sorted(range(len(coords)), key=lambda i: coords[i])
# 6) Build clusters
clusters = []
current = [order[0]]
current_mean = coords[order[0]]
for idx in order[1:]:
c = coords[idx]
# if within threshold of this cluster's mean, add; else start new
if abs(c - current_mean) <= threshold:
current.append(idx)
# update cluster mean incrementally
current_mean = mean([coords[i] for i in current])
else:
clusters.append(current)
current = [idx]
current_mean = c
clusters.append(current)
# 7) Order clusters
if written_vertical:
# Chinese vertical: rightmost column first
clusters.sort(
key=lambda grp: mean(coords[i] for i in grp),
reverse=True
)
else:
# Horizontal: top row first
clusters.sort(
key=lambda grp: mean(coords[i] for i in grp)
)
# 8) Within each cluster, sort by the orthogonal axis
final_indices = []
for grp in clusters:
if written_vertical:
# sort top-to-bottom by y
grp.sort(key=lambda i: centers[i][1])
else:
# sort left-to-right by x
grp.sort(key=lambda i: centers[i][0])
final_indices.extend(grp)
# 9) Crop and return
crops = []
for i in final_indices:
x1, y1, x2, y2 = map(int, bboxes[i])
crops.append(img.crop((x1, y1, x2, y2)))
return crops
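# Illustrative sketch (an addition, not part of the original pipeline): a tiny demo of the
# reading-order logic on synthetic boxes. The function name and box coordinates are
# hypothetical, and the app never calls it.
def _demo_reading_order():
    """Expected ordering for two rows of horizontal text, assuming 20x20 glyph boxes."""
    canvas = Image.new("RGB", (100, 60), "white")
    boxes = [
        [40, 5, 60, 25],   # row 1, middle glyph
        [5, 35, 25, 55],   # row 2, only glyph
        [5, 5, 25, 25],    # row 1, left glyph
        [70, 5, 90, 25],   # row 1, right glyph
    ]
    # Boxes are clustered into rows by y-center, rows read top-to-bottom, each row
    # left-to-right: expected crop order is indices 2, 0, 3, then 1.
    return crop_bboxes_by_reading_order(canvas, boxes, written_vertical=False)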
def move_slider(threshhold_slider, input_img):
pil_img = convert_to_pil(input_img)
if pil_img is None:
return None
with torch.inference_mode():
results = detection_model(source=pil_img, conf=threshhold_slider)
draw_img = draw_bboxes(pil_img, results)
return draw_img
def select_image(evt: gr.SelectData):
selected_index = evt.index
return example_images[selected_index]
def translate_text(threshhold_slider, is_vertical, input_img):
# convert input to pil image
pil_img = convert_to_pil(input_img)
if pil_img is None:
return "", "", "Please upload an image first."
    try:
        with torch.inference_mode():
            results = detection_model(source=pil_img, conf=threshhold_slider)
            sorted_cropped_images = crop_bboxes_by_reading_order(img=pil_img, bboxes=results[0].boxes.xyxy, written_vertical=is_vertical)
            chinese_text = ""
            for crop in sorted_cropped_images:
                crop_tensor = test_transforms(crop).to(user_device)  # Preprocess and move to device
                crop_batch = crop_tensor.unsqueeze(dim=0)  # Add batch dimension
                logits = classifier(crop_batch)  # Classifier was already moved to user_device at load time
                class_idx = logits.argmax(dim=1).cpu().item()  # Convert to a plain Python int
                chinese_character = classes_dict[class_idx]  # classes_dict is a list indexed by class id
                chinese_text += chinese_character
        # Generate pinyin: pinyin() returns one [syllable] list per character
        pinyin_sentence = " ".join(pin[0] for pin in pinyin(chinese_text))
# Load LLM lazily only when needed for translation
current_llm = load_llm()
if current_llm is None:
return chinese_text, pinyin_sentence, "Translation service unavailable - LLM failed to load."
# Generate translation
prompt = f""" You are a professional Chinese to English translator.
1. Translate the following Chinese text to natural, fluent English:
"{chinese_text}"
2. Respond only with the translated English text.
English translation: """
try:
output = current_llm(
prompt,
max_tokens=min(len(chinese_text)*3, 256), # Conservative token limit
stop=["</s>", "\n\n", "Chinese:", "Chinese text:"],
echo=False,
temperature=0.3,
frequency_penalty=0.5,
presence_penalty=0.5,
top_p=0.9,
stream=False
)
# Extract the translation
translation = output["choices"][0]["text"].strip()
            # Strip a pair of surrounding quotes from the translation, if present
            if translation.count('"') >= 2:
                start = translation.index('"') + 1
                end = translation.rindex('"')
                translation = translation[start:end]
return chinese_text, pinyin_sentence, translation
except Exception as e:
logger.error(f"Translation failed: {e}")
return chinese_text, pinyin_sentence, f"Translation failed: {str(e)}"
except Exception as e:
logger.error(f"OCR processing failed: {e}")
return "", "", f"OCR processing failed: {str(e)}"
css = """
.centered-examples {
margin: 0 auto !important;
justify-content: center !important;
gap: 8px !important;
min-height: 150px !important;
}
.centered-examples .thumb {
height: 100px !important;
width: 100px !important;
object-fit: cover !important;
margin: 5px !important;
}
#my_media_gallery {
min-height: 0 !important;
max-height: none !important;
height: auto !important;
}
#my_media_gallery * {
min-height: 0 !important;
}
"""
"""gradio app"""
logger.info("Setting up Gradio interface...")
# Check if examples directory exists
example_dir = "examples"
example_images = []
if os.path.exists(example_dir):
example_images = [os.path.join(example_dir, f) for f in os.listdir(example_dir)
if f.lower().endswith(('.png', '.jpg', '.jpeg', '.webp'))]
logger.info(f"Found {len(example_images)} example images")
else:
logger.warning(f"Examples directory '{example_dir}' not found")
with gr.Blocks(css=css, title="DeepTranslate: Chinese OCR") as program:
gr.Markdown("## DeepTranslate: Chinese OCR with translation to English")
gr.Markdown("Upload or select an image and move the slider to detect characters.")
gr.Markdown("Make sure that the input image is high resolution and not rotated in any way!")
gr.Markdown("Spaces is very slow since its running on a 2 core cpu, expect translation times of 2-4 minutes. (12 seconds on a 4080)")
#inputs
with gr.Column(scale=1):
if example_images:
gallery = gr.Gallery(value=example_images,
label="Example Images (Click to Select)",
columns=6,
                             height="auto",
allow_preview=False,
elem_id="my_media_gallery",
elem_classes=["centered-examples"])
with gr.Row(scale=2):
input_img = gr.Image(label="Input Image ")
detection_img = gr.Image(label="Detection Image", interactive=False)
# slider and button
with gr.Column(scale=1):
with gr.Row(scale=3):
threshhold_slider = gr.Slider(value=0.25, minimum=0, maximum=0.75, label="Detection Threshold", step=0.01)
translate_button = gr.Button("Translate To English", variant="primary")
is_vertical = gr.Checkbox(value=False, label="Vertical Chinese Text?", interactive=True)
# outputs
with gr.Column(scale=1):
with gr.Row(scale=3):
chinese_text = gr.TextArea(label="Chinese Text", max_lines=1000, interactive=False)
pinyin_text = gr.TextArea(label="Chinese Pinyin", max_lines=1000, interactive=False)
english_text = gr.TextArea(label="English Text", max_lines=1000, interactive=False)
# function calling
threshhold_slider.change(fn=move_slider, inputs=[threshhold_slider, input_img], outputs=[detection_img])
translate_button.click(fn=translate_text, inputs=[threshhold_slider, is_vertical, input_img], outputs=[chinese_text, pinyin_text, english_text])
if example_images:
gallery.select(fn=select_image, inputs=None, outputs=input_img)
logger.info("Gradio interface ready!")
if __name__ == "__main__":
program.launch(share=False, server_name="0.0.0.0", show_error=True)