"""Gradio demo app: OCR for Japanese text using a vision encoder-decoder model."""

import re

import gradio as gr
import jaconv
import numpy as np
from transformers import AutoFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel

# Load the model, its tokenizer and its image preprocessor once at import
# time so every request served by the interface reuses the same objects.
model_path = "model/"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = VisionEncoderDecoderModel.from_pretrained(model_path)

# Sample images offered in the UI (paths are relative to the working directory).
examples = [
    'examples/01.png',
    'examples/02.png',
    'examples/03.png',
    'examples/04.png',
    'examples/05.png',
    'examples/06.png',
    'examples/07.png',
]

# Runs of two or more middle dots / full stops, in any mix.
_DOT_RUN_RE = re.compile(r'[・.]{2,}')


def post_process(text: str) -> str:
    """Normalize decoded OCR text.

    Steps, in order:
      1. Remove all whitespace.
      2. Replace the single-character ellipsis with three ASCII dots.
      3. Rewrite every run of two or more ``・``/``.`` characters as the same
         number of ASCII dots.
      4. Convert half-width characters to full-width with ``jaconv.h2z``
         (ASCII and digits explicitly enabled).

    Args:
        text: Raw string produced by the tokenizer's ``decode``.

    Returns:
        The normalized string.
    """
    text = ''.join(text.split())
    text = text.replace('…', '...')
    # Keep the run length; only the dot character itself is unified.
    text = _DOT_RUN_RE.sub(lambda match: '.' * len(match.group()), text)
    text = jaconv.h2z(text, ascii=True, digit=True)
    return text


def infer(image) -> str:
    """Run OCR on a single image and return the recognized text.

    Args:
        image: A PIL image (the interface input is declared with
            ``type="pil"``).

    Returns:
        The post-processed text decoded from the model's first generated
        sequence.
    """
    # Drop colour information but keep a 3-channel image for the extractor.
    image = image.convert('L').convert('RGB')
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
    # A single image is passed in, so only the first sequence is relevant.
    output_ids = model.generate(pixel_values)[0]
    text = tokenizer.decode(output_ids, skip_special_tokens=True)
    return post_process(text)


# NOTE(review): `gr.inputs.Image`, `layout`, `theme="huggingface"` and
# `enable_queue` look tied to an older Gradio release - confirm the pinned
# version before upgrading the dependency.
iface = gr.Interface(
    fn=infer,
    inputs=[gr.inputs.Image(label="Input", type="pil")],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Optical Character Recognition for Japanese Text",
    description="A simple interface for OCR from Japanese manga",
    article="Author: Vu Minh Chien. ",
    allow_flagging='never',
    examples=examples,
    cache_examples=True,
)
iface.launch(enable_queue=True)