Recommended GPU size
Hi, I'm trying to run this model on a 3090 but I run into OOM. Can you provide a simple inference script to compare with mine?
I know I'm doing something wrong but don't know what.
processor = AutoProcessor.from_pretrained("RolmOCR")
model = AutoModelForVision2Seq.from_pretrained("RolmOCR")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def run_ocr(image_path, context):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(
        images=image,
        text=context.strip() or "Return the plain text representation of this document as if you were reading it naturally.",
        return_tensors="pt",
    ).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=2048)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

@app.post("/predict")
async def predict(files: List[UploadFile] = File(...), text: str = Form("")):
    results = []
    for file in files:
        temp_file = UPLOAD_DIR / file.filename
        with temp_file.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        extracted_text = run_ocr(temp_file, text)
        results.append({"filename": file.filename, "extracted_text": extracted_text})
    return JSONResponse(content={"context": text, "results": results})
It also depends on the image size. Or did you try uploading a video? The context window looks good for text messages; I believe you can increase it to 8K for text only, but for images I am not sure, as it again depends on the image size, so try an image under 1MB first. The 3090 has ample VRAM (24GB), so it should be good enough even for images of about 4-5MB.
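If it helps, here is a minimal sketch of what I mean by trying a smaller image first; the helper name and the 1000px cap are just my own assumptions, not something the model requires:

from PIL import Image

def shrink_for_ocr(image_path, max_side=1000):
    # Downscale the longest side to max_side (assumed value) while keeping the
    # aspect ratio, so the vision encoder gets far fewer image tokens.
    image = Image.open(image_path).convert("RGB")
    image.thumbnail((max_side, max_side), Image.LANCZOS)
    return image

shrink_for_ocr("page.png").save("page_small.png")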
@vladciocan88
You should be able to fit the model in 24GB VRAM. Sometimes the input image consumes a lot of tokens. Please see https://github.com/QwenLM/Qwen2.5-VL?tab=readme-ov-file#image-resolution-for-performance-boost and tweak the input resolution. For normal document pages, 1000px width or height should be enough for OCR.
Using flash-attn also helps reduce VRAM usage.
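In case a concrete example helps, here is a rough sketch of what the resolution and flash-attn tweaks could look like; the pixel budgets are illustrative values to tune per the link above, and flash_attention_2 needs the flash-attn package plus a half-precision dtype:

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Cap the image token budget (illustrative limits, tune per the Qwen2.5-VL notes).
min_pixels = 256 * 28 * 28
max_pixels = 1024 * 28 * 28
processor = AutoProcessor.from_pretrained(
    "RolmOCR", min_pixels=min_pixels, max_pixels=max_pixels
)

# flash_attention_2 reduces activation memory during generate().
model = AutoModelForVision2Seq.from_pretrained(
    "RolmOCR",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)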
Please let us know how it works. Cheers!
The problem is that I run out of memory before running inference.
Here is my complete code and stack trace.
Code:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pathlib import Path
from typing import List
import shutil
import uvicorn
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

UPLOAD_DIR = Path("uploads")
UPLOAD_DIR.mkdir(exist_ok=True)

min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained("RolmOCR", min_pixels=min_pixels, max_pixels=max_pixels)
model = AutoModelForVision2Seq.from_pretrained("RolmOCR", torch_dtype="auto", device_map="auto")

app.mount("/uploads", StaticFiles(directory=UPLOAD_DIR), name="uploads")

def run_ocr(image_path, context):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(
        images=image,
        text=context.strip() or "Return the plain text representation of this document as if you were reading it naturally.",
        return_tensors="pt",
    ).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=2048)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

@app.post("/predict")
async def predict(files: List[UploadFile] = File(...), text: str = Form("")):
    results = []
    for file in files:
        temp_file = UPLOAD_DIR / file.filename
        with temp_file.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        extracted_text = run_ocr(temp_file, text)
        results.append({"filename": file.filename, "extracted_text": extracted_text})
    return JSONResponse(content={"context": text, "results": results})

if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8000)
Stack trace:
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:07<00:00, 1.91s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.02s/it]
Traceback (most recent call last):
File "E:\AI-Stuff\main.py", line 82, in <module>
uvicorn.run("main:app", host="0.0.0.0", port=8000)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\main.py", line 579, in run
server.run()
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\server.py", line 66, in run
return asyncio.run(self.serve(sockets=sockets))
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\asyncio\runners.py", line 44, in run
return loop.run_until_complete(main)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\asyncio\base_events.py", line 649, in run_until_complete
return future.result()
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\server.py", line 70, in serve
await self._serve(sockets)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\server.py", line 77, in _serve
config.load()
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\config.py", line 435, in load
self.loaded_app = import_from_string(self.app)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\uvicorn\importer.py", line 19, in import_from_string
module = importlib.import_module(module_str)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "E:\AI-Stuff\RoOCR\main.py", line 29, in <module>
model.to(device)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\transformers\modeling_utils.py", line 3712, in to
return super().to(*args, **kwargs)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 1344, in to
return self._apply(convert)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 904, in _apply
module._apply(fn)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 904, in _apply
module._apply(fn)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 904, in _apply
module._apply(fn)
[Previous line repeated 2 more times]
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 931, in _apply
param_applied = fn(param)
File "C:\Users\devvl\AppData\Local\conda\conda\envs\paddle\lib\site-packages\torch\nn\modules\module.py", line 1330, in convert
return t.to(
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 260.00 MiB. GPU 0 has a total capacity of 24.00 GiB of which 0 bytes is free. Of the allocated memory 57.77 GiB is allocated by PyTorch, and 683.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
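(For reference, the allocator hint at the end of that message can be applied by setting the environment variable before torch is first imported; this is only a sketch of how to set it, not a claim that it resolves this particular OOM.)

import os

# Must be set before the first CUDA allocation, i.e. before importing torch.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch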