from threading import Thread
from typing import Iterator
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 10240
DESCRIPTION = """\
# CLEX-7B-Chat-16K
This Space demonstrates [CLEX-7B-Chat-16K](https://huggingface.co/DAMO-NLP-SG/CLEX-7B-Chat-16K), a Llama-2-7B model fine-tuned with our [CLEX](https://arxiv.org/abs/2310.16450) method. Feel free to play with it, or duplicate the Space to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
Due to GPU memory limits, this web demo currently supports a maximum input length of 10k tokens; running the demo locally (with more GPU memory) is highly recommended for longer inputs.
PDF input support is experimental.
"""
# LICENSE = """
# <p/>
# ---
# As a derivate work of [Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) by Meta,
# this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).
# """
CITE = """
If you find our project useful, we hope you will star our repo and cite our paper as follows:
```
@article{damonlpsg2023clex,
  author  = {Chen, Guanzheng and Li, Xin and Meng, Zaiqiao and Liang, Shangsong and Bing, Lidong},
  title   = {CLEX: Continuous Length Extrapolation for Large Language Models},
  year    = {2023},
  journal = {arXiv preprint arXiv:2310.16450},
  url     = {https://arxiv.org/abs/2310.16450}
}
```
"""
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
# if torch.cuda.is_available():
model_id = "DAMO-NLP-SG/CLEX-7b-Chat-16K"
from modeling_llama import LlamaForCausalLM
# from configuration_clex import CLEXLlamaConfig
# config = CLEXLlamaConfig.from_pretrained(
# model_id
# )
model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, trust_remote_code=True, low_cpu_mem_usage=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
tokenizer.use_default_system_prompt = False
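# Illustrative sanity check of the model/tokenizer pair (not executed by the app; assumes a CUDA device is available):
#   ids = tokenizer("Hello, CLEX!", return_tensors="pt").to(model.device)
#   out = model.generate(**ids, max_new_tokens=8)
#   print(tokenizer.decode(out[0], skip_special_tokens=True))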
import PyPDF2
from io import BytesIO
def process_pdf(input_pdf):
    # Create a PDF reader from the uploaded file's temp path
    reader = PyPDF2.PdfReader(input_pdf.name)
    # Extract the text from each page of the PDF
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    # Return the extracted text
    return text
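# Illustrative usage (not executed by the app): Gradio passes the upload as an object with a
# `.name` attribute holding a temp path, so any object with that attribute works:
#   from types import SimpleNamespace
#   text = process_pdf(SimpleNamespace(name="example.pdf"))  # "example.pdf" is a hypothetical path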
def build_chat(prompt):
    # Wrap a raw prompt in the Vicuna conversation template
    from fastchat.model import get_conversation_template
    conv = get_conversation_template("vicuna")
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], None)
    return conv.get_prompt()
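# e.g. (illustrative): build_chat("What does CLEX stand for?") returns the question wrapped
# in the full Vicuna conversation template as a single prompt string.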
import re
def replace_repeated_spaces_and_newlines(text):
    # Collapse runs of spaces/tabs into a single space (without touching newlines)
    text = re.sub(r'[ \t]+', ' ', text)
    # Collapse runs of newlines into a single newline
    text = re.sub(r'\n+', '\n', text)
    return text
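# For example (illustrative): "a  b\t c\n\n\nd"  ->  "a b c\nd"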
from fastchat.model import get_conversation_template
@spaces.GPU
def generate(
    message: str,
    chat_history,
    system_prompt: str,
    input_pdf: BytesIO = None,
    max_new_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 1.0,
    top_k: int = 50,
    repetition_penalty: float = 1.0,
) -> Iterator[str]:
    if input_pdf is not None:
        pdf_text = process_pdf(input_pdf)
        pdf_text = replace_repeated_spaces_and_newlines(pdf_text)
        message += f"\nThis is the beginning of a pdf\n{pdf_text}This is the end of a pdf\n"
    conv = get_conversation_template("vicuna")
    if system_prompt is not None:
        conv.set_system_message(system_prompt)
    conv.append_message(conv.roles[0], message)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
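    # The Vicuna template produces a prompt of roughly this shape (illustrative):
    #   "<system message> USER: <message> ASSISTANT:"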
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
    # Trim the prompt from the left if it exceeds the maximum supported input length
    if inputs["input_ids"].shape[-1] > MAX_INPUT_TOKEN_LENGTH:
        inputs["input_ids"] = inputs["input_ids"][:, -MAX_INPUT_TOKEN_LENGTH:]
        inputs["attention_mask"] = inputs["attention_mask"][:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    # Run generation in a background thread and stream partial outputs to the UI
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
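# Illustrative direct call (outside Gradio; not executed by the app; assumes a CUDA device is available):
#   for partial in generate("Summarize CLEX in one sentence.", [], "You are a helpful assistant."):
#       print(partial)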
# def generate_with_pdf(
# message: str,
# chat_history,
# system_prompt: str,
# input_pdf: BytesIO = None,
# max_new_tokens: int = 1024,
# temperature: float = 0.6,
# top_p: float = 0.9,
# top_k: int = 50,
# repetition_penalty: float = 1.2,
# ) -> Iterator[str]:
# if input_pdf is not None:
# pdf_text = process_pdf(input_pdf)
# # print(pdf_text)
# message += f"\nThis is the beginning of a pdf\n{pdf_text}This is the end of a pdf\n"
# yield from generate(
# message,
# chat_history,
# system_prompt,
# max_new_tokens,
# temperature,
# top_p,
# top_k,
# repetition_penalty
# )
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Textbox(label="System prompt", lines=6),
        gr.File(label="PDF File", file_types=[".pdf"]),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.7,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1.0,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.0,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you briefly explain what the Python programming language is?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
    chat_interface.render()
    gr.Markdown(CITE)

if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=False)
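# To expose a temporary public URL when running locally, pass share=True instead (illustrative):
#   demo.queue(max_size=20).launch(share=True)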