Loading issue in HF Spaces

#15
by maxjvd - opened

I am not able to run bloom-7b1 on an A10 large, even though the same setup works with models like falcon-7b. I do not understand why, since bloom-7b1 does not seem much larger than what the A10 large can handle (about 15 GB of VRAM). The model initializes, but inference takes a very long time.

Any ideas on what I may be doing wrong here?
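
For context, here is a quick back-of-the-envelope on the weights alone (roughly 7.1B parameters; this ignores activations, KV cache, and framework overhead):

#-- rough memory needed just to hold bloom-7b1's weights
params = 7.1e9
print(f"fp32: {params * 4 / 1e9:.1f} GB")  #-- ~28.4 GB
print(f"fp16: {params * 2 / 1e9:.1f} GB")  #-- ~14.2 GB

So unless the weights are loaded in fp16, they alone may already exceed what the card can hold. Here is the full code of my Space: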


import gradio as gr
import os
import torch

#-- sanity check on hardware
print(f"Is CUDA available: {torch.cuda.is_available()}")
#-- True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
#-- Nvidia something something

from langchain import PromptTemplate, HuggingFaceHub, LLMChain

#-- possible models
flan = "google/flan-t5-xxl"
falcon_7b = "tiiuae/falcon-7b"
falcon_7b_instruct = "tiiuae/falcon-7b-instruct"
bloom_7b = "bigscience/bloom-7b1"
bloom_7b_instruct = "bigscience/bloomz-7b1-mt"
bloom_560m = "bigscience/bloom-560m"

#-- set args for retrieved model
args = {"temperature":0.0001, "max_length":250}
#-- specify model
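#-- note: HuggingFaceHub sends each request to the hosted Inference API,
#-- so these calls do not run on this Space's GPU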
llm = HuggingFaceHub(repo_id=bloom_7b, model_kwargs=args)
#-- sanity check
print('LLM loaded!')

#-- variable for input + eventual prompts
template = '{question}'
prompt = PromptTemplate(template=template, input_variables=["question"])

#-- init langchain
chain = LLMChain(llm=llm, prompt=prompt)

#-- sanity check
print(chain.run('What is the Sally-Anne test?'))

#-- Run the chain only specifying the input variable.
def answer(question):
    return chain.run(question)

#-- init app
demo = gr.Interface(fn=answer, inputs='text', outputs='text', examples=[['Hey how are you']])
demo.launch()
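
For completeness, this is the local-loading variant I am considering as an alternative, since HuggingFaceHub proxies calls to the hosted Inference API rather than using the Space's GPU. A minimal sketch, assuming langchain's HuggingFacePipeline wrapper and fp16 weights (untested in this Space):

import torch
from langchain.llms import HuggingFacePipeline

#-- load bloom-7b1 on the A10 itself, in fp16 so the weights (~14 GB) fit
local_llm = HuggingFacePipeline.from_model_id(
    model_id=bloom_7b,
    task="text-generation",
    device=0,  #-- cuda:0, the Space's GPU
    model_kwargs={"torch_dtype": torch.float16},
)
chain = LLMChain(llm=local_llm, prompt=prompt)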
