import os
import torch
import streamlit as st
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
model_name_or_path = "meta-llama/Llama-2-13b-chat-hf"
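# Note: this Llama-2 checkpoint is gated on the Hugging Face Hub, so the
# environment generally needs an accepted license and an authenticated token
# (e.g. `huggingface-cli login` or a token secret) before it can be downloaded.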
# Count the number of GPUs available
gpu_count = torch.cuda.device_count()

# Determine the device to use based on GPU availability and count:
# - more than one GPU: 'auto' lets the library spread the model across them
# - exactly one GPU:   'cuda:0' pins the first GPU
# - no GPU:            fall back to the CPU
if torch.cuda.is_available() and gpu_count > 1:
    device = 'auto'
elif torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
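# Optional 4-bit quantization: a minimal sketch of the `bnb_config` referenced
# below (values are illustrative assumptions, not from the original setup),
# left commented out to match the unquantized load:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )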
# device_map='auto' lets accelerate shard the model across all visible GPUs;
# the single-device string computed above could be passed here instead to pin one device.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map='auto',
)
print(model.hf_device_map)  # shows which layers landed on which device
# Generation settings are passed directly to the pipeline so they reach
# model.generate(); model_kwargs is only applied when the pipeline loads the
# model itself.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    do_sample=True,
    temperature=0.3,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.15,
    num_return_sequences=1,
    max_new_tokens=2500,  # bound only the generated tokens, not the prompt
    pad_token_id=tokenizer.eos_token_id,  # Llama-2 has no pad token; reuse EOS (id 2)
)
llm = HuggingFacePipeline(pipeline=pipe)
template = """Prompt: {query}
Answer: """

prompt_template = PromptTemplate(
    input_variables=["query"],
    template=template,
)
# Instantiate the chain
llm_chain = LLMChain(prompt=prompt_template, llm=llm)
st.title('Test Multi GPU')
md = st.text_area('Type in your markdown string (without outer quotes)')
if st.button("Enter"):
    with st.spinner(text="In progress..."):
        resp = llm_chain.invoke(md)['text']
        st.write(resp)
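# To try the app locally (filename assumed to be app.py): streamlit run app.py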