tencent
/

HunyuanOCR

@@ -6,6 +6,7 @@ language:
 pipeline_tag: image-text-to-text
 library_name: transformers
 ---
 <div align="center">
 # HunyuanOCR
@@ -45,6 +46,25 @@ from transformers import HunYuanVLForConditionalGeneration
 from PIL import Image
 import torch
 model_name_or_path = "tencent/HunyuanOCR"
 processor = AutoProcessor.from_pretrained(model_name_or_path, use_fast=False)
 img_path = "path/to/your/image.jpg"
@@ -93,9 +113,9 @@ else:
 generated_ids_trimmed = [
     out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
 ]
-output_texts = processor.batch_decode(
     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
 print(output_texts)
 ```
@@ -114,9 +134,9 @@ from PIL import Image
 from transformers import AutoProcessor
 model_path = "tencent/HunyuanOCR"
-llm = LLM(model=model_path)
 processor = AutoProcessor.from_pretrained(model_path)
-sampling_params = SamplingParams(temperature=0.0, max_tokens=16384)
 img_path = "/path/to/image.jpg"
 img = Image.open(img_path)
@@ -143,14 +163,15 @@ print(output.outputs[0].text)
 ## 📚 Citation
-@misc{hunyuanocr2025,
-    title={HunyuanOCR Technical Report},
-    author={Tencent Hunyuan Vision Team},
-    year={2025},
-    publisher={GitHub},
-    journal={GitHub repository},
-    howpublished={\url{https://github.com/Tencent-Hunyuan/HunyuanOCR}}
 }
 ## 🙏 Acknowledgements
 We would like to thank [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [MinerU](https://github.com/opendatalab/MinerU), [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR), and [dots.ocr](https://github.com/rednote-hilab/dots.ocr) for their valuable models and ideas.

 pipeline_tag: image-text-to-text
 library_name: transformers
 ---
 <div align="center">
 # HunyuanOCR
 from PIL import Image
 import torch
def clean_repeated_substrings(text, *, min_length=8000, min_repeats=10):
    """Collapse a substring that repeats back-to-back at the end of ``text``.

    Scans candidate unit lengths from 2 upward. For each length, the last
    ``length`` characters are taken as the unit and the function counts how
    many times that unit occurs consecutively, walking backwards from the end
    of the string. The first unit found that repeats at least ``min_repeats``
    times is collapsed so that exactly one copy remains.

    Args:
        text: A string, or a list of strings. A list (for example, the
            result of decoding a batch) is cleaned element by element.
        min_length: Strings shorter than this are returned unchanged.
        min_repeats: Minimum number of consecutive copies of the trailing
            unit required before the run is collapsed. Must be >= 1.

    Returns:
        The cleaned string, or a new list of cleaned strings when ``text``
        is a list.

    Raises:
        ValueError: If ``min_repeats`` is less than 1.
    """
    if min_repeats < 1:
        raise ValueError("min_repeats must be at least 1")

    # A list used to fall through to the string logic: len() counted items,
    # the length guard fired, and the list came back uncleaned.
    if isinstance(text, list):
        return [
            clean_repeated_substrings(
                item, min_length=min_length, min_repeats=min_repeats
            )
            for item in text
        ]

    n = len(text)
    if n < min_length:
        return text

    # A unit repeated min_repeats times can be at most n // min_repeats long.
    for length in range(2, n // min_repeats + 1):
        candidate = text[-length:]
        count = 0
        i = n - length
        # Step backwards one unit at a time while the unit keeps matching.
        while i >= 0 and text[i:i + length] == candidate:
            count += 1
            i -= length
        if count >= min_repeats:
            # Drop all but one copy of the repeated unit.
            return text[:n - length * (count - 1)]
    return text
 model_name_or_path = "tencent/HunyuanOCR"
 processor = AutoProcessor.from_pretrained(model_name_or_path, use_fast=False)
 img_path = "path/to/your/image.jpg"
 generated_ids_trimmed = [
     out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
 ]
+output_texts = clean_repeated_substrings(processor.batch_decode(
     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+))
 print(output_texts)
 ```
 from transformers import AutoProcessor
 model_path = "tencent/HunyuanOCR"
+llm = LLM(model=model_path, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(model_path)
+sampling_params = SamplingParams(temperature=0, max_tokens=16384)
 img_path = "/path/to/image.jpg"
 img = Image.open(img_path)
 ## 📚 Citation
+```
+@software{hunyuanocr2025,
+  author    = {Tencent Hunyuan Vision Team},
+  title     = {HunyuanOCR Technical Report},
+  year      = {2025},
+  url       = {https://github.com/Tencent-Hunyuan/HunyuanOCR},
+  publisher = {GitHub}
 }
+```
 ## 🙏 Acknowledgements
 We would like to thank [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [MinerU](https://github.com/opendatalab/MinerU), [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR), and [dots.ocr](https://github.com/rednote-hilab/dots.ocr) for their valuable models and ideas.