Oleg Lavrovsky committed
Commit 64d30b7 · unverified · 1 parent: 98cad23

Mitigate peak memory usage

Files changed (1)
  1. app.py +7 -3
app.py CHANGED
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 
 # Required for access to a gated model
 load_dotenv()
-hf_token = os.getenv("HUGGING_FACE_TOKEN", None)
+hf_token = os.getenv("HF_TOKEN", None)
 if hf_token is not None:
     login(token=hf_token)
 
@@ -53,8 +53,12 @@ async def lifespan(app: FastAPI):
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-        ).to(device)
-        logger.info("Model loaded successfully!")
+            device_map="auto",  # Automatically splits model across CPU/GPU
+            low_cpu_mem_usage=True,  # Avoids unnecessary CPU memory duplication
+            offload_folder="offload",  # Temporary offload to disk
+        )
+        # .to(device)
+        logger.info(f"Model loaded successfully! ({device})")
     except Exception as e:
         logger.error(f"Failed to load model: {e}")
         raise e
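
For reference, below is a minimal, self-contained sketch of the loading path after this commit. It is not the repo's exact code: the FastAPI lifespan wrapper is omitted, model_name is a placeholder (the actual value is defined elsewhere in app.py), and device_map="auto" additionally requires the accelerate package to be installed.

# Sketch of the patched loading path (assumptions: placeholder model_name,
# no FastAPI lifespan; requires `pip install transformers accelerate python-dotenv`).
import logging
import os

from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Required for access to a gated model
load_dotenv()
hf_token = os.getenv("HF_TOKEN", None)
if hf_token is not None:
    login(token=hf_token)

model_name = "gpt2"  # placeholder; the repo's actual model name is not shown in this diff

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",         # accelerate places layers on GPU/CPU as memory allows
        low_cpu_mem_usage=True,    # stream checkpoint shards instead of building a second full copy in RAM
        offload_folder="offload",  # spill layers that fit nowhere else to disk
    )
    logger.info("Model loaded successfully!")
except Exception as e:
    logger.error(f"Failed to load model: {e}")
    raise e

The point of the change: the old pattern loads the full model into CPU RAM and then copies it to the accelerator with .to(device), so peak memory briefly holds the weights twice. With device_map="auto" plus low_cpu_mem_usage=True, shards are streamed directly onto their target devices, and whatever does not fit is offloaded to the "offload" folder on disk.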