Spaces:
Runtime error
Runtime error
Oleg Lavrovsky
committed on
Mitigate peak memory usage
Browse files
app.py
CHANGED
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
|
|
17 |
|
18 |
# Required for access to a gated model
|
19 |
load_dotenv()
|
20 |
-
hf_token = os.getenv("HF_TOKEN", None)
|
21 |
if hf_token is not None:
|
22 |
login(token=hf_token)
|
23 |
|
@@ -53,8 +53,12 @@ async def lifespan(app: FastAPI):
|
|
53 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
54 |
model = AutoModelForCausalLM.from_pretrained(
|
55 |
model_name,
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
58 |
except Exception as e:
|
59 |
logger.error(f"Failed to load model: {e}")
|
60 |
raise e
|
|
|
17 |
|
18 |
# Required for access to a gated model
|
19 |
load_dotenv()
|
20 |
+
hf_token = os.getenv("HF_TOKEN", None)
|
21 |
if hf_token is not None:
|
22 |
login(token=hf_token)
|
23 |
|
|
|
53 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
54 |
model = AutoModelForCausalLM.from_pretrained(
|
55 |
model_name,
|
56 |
+
device_map="auto", # Automatically splits model across CPU/GPU
|
57 |
+
low_cpu_mem_usage=True, # Avoids unnecessary CPU memory duplication
|
58 |
+
offload_folder="offload", # Temporary offload to disk
|
59 |
+
)
|
60 |
+
#.to(device)
|
61 |
+
logger.info(f"Model loaded successfully! ({device})")
|
62 |
except Exception as e:
|
63 |
logger.error(f"Failed to load model: {e}")
|
64 |
raise e
|