Same model as stepfun-ai/GOT-OCR2_0, but with customized source code:

  1. Removes the verovio dependency, since most people don't need to OCR musical notation.
  2. Allows the use of float16 if your GPU doesn't support bfloat16 (see the sketch after this list).
  3. Supports Transformers==4.48.3, so it no longer emits a stream of warning messages.
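
For example, to load in float16 on a GPU without bfloat16 support, here is a minimal sketch, assuming the standard torch_dtype argument to from_pretrained is honored by the custom code (the model path matches the Example Usage section below):

import torch
from transformers import AutoModel, AutoTokenizer

MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    torch_dtype=torch.float16,  # float16 instead of bfloat16 for older GPUs (assumes the custom code honors torch_dtype)
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>")
)
model = model.eval()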
Original Model Card

General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model

🔋Online Demo | 🌟GitHub | 📜Paper

Haoran Wei*, Chenglong Liu*, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, Xiangyu Zhang


Usage

Inference using Hugging Face Transformers on NVIDIA GPUs. Requirements tested on Python 3.10:

torch==2.0.1
torchvision==0.15.2
transformers==4.37.2
tiktoken==0.6.0
verovio==4.3.1
accelerate==0.28.0
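
These pinned versions can be installed in one step, e.g.:

pip install torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0

Then, in Python: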
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
model = model.eval().cuda()


# input your test image
image_file = 'xxx.jpg'

# plain texts OCR
res = model.chat(tokenizer, image_file, ocr_type='ocr')

# format texts OCR:
# res = model.chat(tokenizer, image_file, ocr_type='format')

# fine-grained OCR:
# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_box='')
# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_box='')
# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_color='')
# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_color='')

# multi-crop OCR:
# res = model.chat_crop(tokenizer, image_file, ocr_type='ocr')
# res = model.chat_crop(tokenizer, image_file, ocr_type='format')

# render the formatted OCR results:
# res = model.chat(tokenizer, image_file, ocr_type='format', render=True, save_render_file='./demo.html')

print(res)

More details about 'ocr_type', 'ocr_box', 'ocr_color', and 'render' can be found in our GitHub repository, along with our training code.

More Multimodal Projects

👏 Welcome to explore more multimodal projects of our team:

Vary | Fox | OneChart

Citation

If you find our work helpful, please consider citing our papers 📝 and liking this project ❤️!

@article{wei2024general,
  title={General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model},
  author={Wei, Haoran and Liu, Chenglong and Chen, Jinyue and Wang, Jia and Kong, Lingyu and Xu, Yanming and Ge, Zheng and Zhao, Liang and Sun, Jianjian and Peng, Yuang and others},
  journal={arXiv preprint arXiv:2409.01704},
  year={2024}
}

@article{liu2024focus,
  title={Focus Anywhere for Fine-grained Multi-page Document Understanding},
  author={Liu, Chenglong and Wei, Haoran and Chen, Jinyue and Kong, Lingyu and Ge, Zheng and Zhu, Zining and Zhao, Liang and Sun, Jianjian and Han, Chunrui and Zhang, Xiangyu},
  journal={arXiv preprint arXiv:2405.14295},
  year={2024}
}

@article{wei2023vary,
  title={Vary: Scaling up the Vision Vocabulary for Large Vision-Language Models},
  author={Wei, Haoran and Kong, Lingyu and Chen, Jinyue and Zhao, Liang and Ge, Zheng and Yang, Jinrong and Sun, Jianjian and Han, Chunrui and Zhang, Xiangyu},
  journal={arXiv preprint arXiv:2312.06109},
  year={2023}
}

Example Usage

import fitz
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import torch

# The following lines are optional - they silence the last remaining logging message from Transformers.
# from transformers import logging as transformers_logging
# transformers_logging.set_verbosity_error()

MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized"  # Replace with local path if desired
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>")
)
model = model.eval().cuda()

def clean_repetitive_lines(text):
    """
    Removes repetitive lines from the OCR output before saving the .txt file. This is necessary because
    the model sometimes produces OCR artifacts.  All duplicates above 2 instances are removed.
    """
    lines = text.split('\n')
    cleaned_lines = []
    i = 0
    while i < len(lines):
        cleaned_lines.append(lines[i])
        repeat_count = 1
        j = i + 1
        while j < len(lines) and lines[j] == lines[i]:
            repeat_count += 1
            j += 1
        if repeat_count > 2:
            if i + 1 < len(lines):
                cleaned_lines.append(lines[i + 1])
            i = j
        else:
            i += 1
    return '\n'.join(cleaned_lines)
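
# A quick check of the collapsing behavior (hypothetical input): the line 'b' appears
# four times in a row, so it is collapsed down to two instances.
# clean_repetitive_lines("a\nb\nb\nb\nb\nc") returns "a\nb\nb\nc"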

@torch.inference_mode()
def process_pdf_for_ocr(tokenizer, model, pdf_path):
    pdf_document = fitz.open(pdf_path)
    full_text = []
    
    for page_num in range(len(pdf_document)):
        page = pdf_document[page_num]
        zoom = 2
        matrix = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=matrix)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        # gradio_input=True is used because we're creating images for each page of a .pdf using PyMuPDF and Pillow instead of relying on the model's internal code
        res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
        
        if res.strip():
            full_text.append(res)

    complete_text = '\n'.join(full_text)
    cleaned_text = clean_repetitive_lines(complete_text)
    
    with open("extracted_text_got_ocr.txt", "w", encoding="utf-8") as f:
        f.write(cleaned_text)
    
    pdf_document.close()
    print("Results have been saved to extracted_text_got_ocr.txt")

# Example usage
pdf_path = "path/to/your/pdf"
process_pdf_for_ocr(tokenizer, model, pdf_path)