jakep-allenai committed
Commit 7ed6733 · verified · 1 Parent(s): 24221ab

Update README.md

Files changed (1):
  1. README.md +5 -5
README.md CHANGED
@@ -93,19 +93,19 @@ import urllib.request
 
 from io import BytesIO
 from PIL import Image
-from transformers import AutoProcessor, Qwen2_5VLForConditionalGeneration
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
 
 # Initialize the model
-model = Qwen2_5VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-1025", torch_dtype=torch.bfloat16).eval()
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-1025", torch_dtype=torch.bfloat16).eval()
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
 # Grab a sample PDF
-urllib.request.urlretrieve("https://molmo.allenai.org/paper.pdf", "./paper.pdf")
+urllib.request.urlretrieve("https://olmocr.allenai.org/papers/olmocr.pdf", "./paper.pdf")
 
 # Render page 1 to an image
 image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1288)
@@ -138,7 +138,7 @@ inputs = {key: value.to(device) for (key, value) in inputs.items()}
 # Generate the output
 output = model.generate(
     **inputs,
-    temperature=0.8,
+    temperature=0.1,
     max_new_tokens=50,
     num_return_sequences=1,
     do_sample=True,
@@ -152,7 +152,7 @@ text_output = processor.tokenizer.batch_decode(
 )
 
 print(text_output)
-# ['{"primary_language":"en","is_rotation_valid":true,"rotation_correction":0,"is_table":false,"is_diagram":false,"natural_text":"Molmo and PixMo:\\nOpen Weights and Open Data\\nfor State-of-the']
+# ['---\nprimary_language: en\nis_rotation_valid: True\nrotation_correction: 0\nis_table: False\nis_diagram: False\n---\nolmOCR: Unlocking Trillions of Tokens in PDFs with Vision Language Models\n\nJake Poz']
 ```
 
 ## License and use
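
A note for downstream consumers of this change: the expected output moves from a single JSON object to YAML front matter followed by the page's natural text. Below is a minimal sketch of how the new format could be split apart, assuming PyYAML and the `---` delimiters shown in the diff; the `split_front_matter` helper is hypothetical and not part of olmocr.

```python
import yaml  # PyYAML; an assumed dependency for this sketch only

def split_front_matter(raw: str):
    """Split '---\\n<yaml>\\n---\\n<text>' into (metadata dict, natural text)."""
    # maxsplit=2 keeps any later '---' sequences inside the page text intact
    _, front_matter, natural_text = raw.split("---", 2)
    return yaml.safe_load(front_matter), natural_text.lstrip("\n")

# Sample taken from the expected output in the diff above (truncated)
raw = ("---\nprimary_language: en\nis_rotation_valid: True\nrotation_correction: 0\n"
       "is_table: False\nis_diagram: False\n---\nolmOCR: Unlocking Trillions of Tokens "
       "in PDFs with Vision Language Models\n\nJake Poz")

metadata, text = split_front_matter(raw)
print(metadata["is_rotation_valid"])  # True (YAML parses 'True' as a bool)
print(text.splitlines()[0])           # first line of the recovered page text
```

With the temperature lowered to 0.1, sampling is close to greedy, so repeated runs of the README snippet should produce output very close to the sample comment above.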