Recommended GPU size
Hi, I'm trying to run this model on a 3090 but I run into OOM. Can you provide a simple inference script to compare with mine?
I know I'm doing something wrong but don't know what.
processor = AutoProcessor.from_pretrained("RolmOCR")
model = AutoModelForVision2Seq.from_pretrained("RolmOCR")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def run_ocr(image_path, context):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(
        images=image,
        text=context.strip() or "Return the plain text representation of this document as if you were reading it naturally.",
        return_tensors="pt",
    ).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=2048)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

@app.post("/predict")
async def predict(files: List[UploadFile] = File(...), text: str = Form("")):
    results = []
    for file in files:
        temp_file = UPLOAD_DIR / file.filename
        with temp_file.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        extracted_text = run_ocr(temp_file, text)
        results.append({"filename": file.filename, "extracted_text": extracted_text})
    return JSONResponse(content={"context": text, "results": results})
It also depends on the image size. Or did you try uploading a video? The context window looks good for text messages; I believe you can increase it to 8K for text only, but for images I am not sure, as it again depends on the image size, so try an image under 1MB first. The 3090 has ample VRAM (24GB), so it should be good enough even for images of about 4-5MB.
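If it helps, here is a minimal sketch of what I mean by trying a smaller image first; the helper name and the 1000px cap are just my own assumptions, not something the model requires:

from PIL import Image

def shrink_for_ocr(image_path, max_side=1000):
    # Downscale the longest side to max_side (assumed value) while keeping the
    # aspect ratio, so the vision encoder gets far fewer image tokens.
    image = Image.open(image_path).convert("RGB")
    image.thumbnail((max_side, max_side), Image.LANCZOS)
    return image

shrink_for_ocr("page.png").save("page_small.png")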
@vladciocan88
You should be able to fit the model in 24GB VRAM. Sometimes the input image consumes a lot of tokens. Please see https://github.com/QwenLM/Qwen2.5-VL?tab=readme-ov-file#image-resolution-for-performance-boost and tweak the input resolution. For normal document pages, 1000px width or height should be enough for OCR.
Using flash-attn also helps reduce VRAM usage.
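In case a concrete example helps, here is a rough sketch of what the resolution and flash-attn tweaks could look like; the pixel budgets are illustrative values to tune per the link above, and flash_attention_2 needs the flash-attn package plus a half-precision dtype:

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Cap the image token budget (illustrative limits, tune per the Qwen2.5-VL notes).
min_pixels = 256 * 28 * 28
max_pixels = 1024 * 28 * 28
processor = AutoProcessor.from_pretrained(
    "RolmOCR", min_pixels=min_pixels, max_pixels=max_pixels
)

# flash_attention_2 reduces activation memory during generate().
model = AutoModelForVision2Seq.from_pretrained(
    "RolmOCR",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)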
Please let us know how it works. Cheers!
The problem is that I run out of memory before running inference.
Here is my complete code and stack trace.
Code:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pathlib import Path
from typing import List
import shutil
import uvicorn
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

UPLOAD_DIR = Path("uploads")
UPLOAD_DIR.mkdir(exist_ok=True)

min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained("RolmOCR", min_pixels=min_pixels, max_pixels=max_pixels)
model = AutoModelForVision2Seq.from_pretrained("RolmOCR", torch_dtype="auto", device_map="auto")

app.mount("/uploads", StaticFiles(directory=UPLOAD_DIR), name="uploads")

def run_ocr(image_path, context):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(
        images=image,
        text=context.strip() or "Return the plain text representation of this document as if you were reading it naturally.",
        return_tensors="pt",
    ).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=2048)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

@app.post("/predict")
async def predict(files: List[UploadFile] = File(...), text: str = Form("")):
    results = []
    for file in files:
        temp_file = UPLOAD_DIR / file.filename
        with temp_file.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        extracted_text = run_ocr(temp_file, text)
        results.append({"filename": file.filename, "extracted_text": extracted_text})
    return JSONResponse(content={"context": text, "results": results})

if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8000)
Stack trace:
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:07<00:00, 1.91s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.02s/it]
Traceback (most recent call last):
File "E:\AI-Stuff\main.py", line 82, in <module>
uvicorn.run("main:app", host="0.0.0.0", port=8000)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\main.py", line 579, in run
server.run()
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\server.py", line 66, in run
return asyncio.run(self.serve(sockets=sockets))
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\asyncio\runners.py", line 44, in run
return loop.run_until_complete(main)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\asyncio\base_events.py", line 649, in run_until_complete
return future.result()
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\server.py", line 70, in serve
await self._serve(sockets)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\server.py", line 77, in _serve
config.load()
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\config.py", line 435, in load
self.loaded_app = import_from_string(self.app)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\importer.py", line 19, in import_from_string
module = importlib.import_module(module_str)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "E:\AI-Stuff\RoOCR\main.py", line 29, in <module>
model.to(device)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\transformers\modeling_utils.py", line 3712, in to
return super().to(*args, **kwargs)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 1344, in to
return self._apply(convert)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 904, in _apply
module._apply(fn)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 904, in _apply
module._apply(fn)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 904, in _apply
module._apply(fn)
[Previous line repeated 2 more times]
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 931, in _apply
param_applied = fn(param)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 1330, in convert
return t.to(
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 260.00 MiB. GPU 0 has a total capacity of 24.00 GiB of which 0 bytes is free. Of the allocated memory 57.77 GiB is allocated by PyTorch, and 683.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
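(For reference, the allocator hint at the end of that message can be applied by setting the environment variable before torch is first imported; this is only a sketch of how to set it, not a claim that it resolves this particular OOM.)

import os

# Must be set before the first CUDA allocation, i.e. before importing torch.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch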