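"""Gradio chat demo that streams responses from the mehrdad-es/legalLLM-hf model,
building prompts with an Alpaca-style instruction template."""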
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
from threading import Thread
def generate_prompt(example: dict) -> str:
    """Generates a standardized message to prompt the model with an instruction, optional input and a
    'response' field."""
    if example["input"]:
        return (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"
        )
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{example['instruction']}\n\n### Response:"
    )
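# Load the fine-tuned tokenizer and model in half precision and move the model to the first GPU.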
tokenizer = AutoTokenizer.from_pretrained("mehrdad-es/legalLLM-hf")
model = AutoModelForCausalLM.from_pretrained("mehrdad-es/legalLLM-hf", torch_dtype=torch.float16)
model = model.to("cuda:0")
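# Custom stopping criterion: end generation as soon as the most recently generated token
# is one of the hard-coded stop ids (presumably chosen for this model's tokenizer).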
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [30, 0]  # token ids treated as end-of-response markers
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False
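# Streaming callback for gr.ChatInterface. The incoming message is expected in the form
# "<instruction>!!<input>", where the part after "!!" is optional extra context.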
def predict(message, history):
    # Fall back to an empty input if the message has no "!!" separator instead of crashing.
    if "!!" in message:
        prompt, user_input = message.split("!!", 1)
    else:
        prompt, user_input = message, ""
    message = generate_prompt({"instruction": prompt, "input": user_input})
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

    # Flatten the chat history into a single prompt string.
    messages = "".join(
        "".join(["\n<USER>:" + item[0], "\n<ASSISTANT>:" + item[1]])
        for item in history_transformer_format
    )
    model_inputs = tokenizer([messages], return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=400,
        do_sample=True,
        top_p=0.85,
        top_k=500,
        temperature=0.1,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([stop]),
    )
    # Run generation in a background thread so tokens can be streamed to the UI as they arrive.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        if new_token != "<":
            partial_message += new_token
            yield partial_message
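# Build the chat UI around predict; queue() enables request queueing so the yielded
# partial responses stream into the chat window as they are generated.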
gr.ChatInterface(predict).queue().launch()