# Final-Llama / app.py
import os
from threading import Thread
from typing import Iterator
import time
import textwrap
import nltk
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import tiktoken
import numpy as np
import multiprocessing
nltk.download('punkt')
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
num_processes_tone = multiprocessing.cpu_count()
# Load the model and tokenizer only when a GPU is available; generate() below
# assumes they exist.
if torch.cuda.is_available():
    model_id = "daryl149/llama-2-7b-chat-hf"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
def num_tokens_from_string(string: str, model_name: str = "gpt-3.5-turbo") -> int:
    """Count tokens in a string with tiktoken, plus a fixed 7-token allowance."""
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens + 7
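# Illustrative note (an assumption about intent, not part of the original file):
# the fixed +7 appears to leave headroom for per-chunk prompt overhead, e.g.
#     num_tokens_from_string("Change the tone of this sentence.")
# returns the tiktoken token count of the string plus 7.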
def merge_sentences(sentences):
    """Greedily merge sentences into chunks of at most ~500 tokens each."""
    merged_list = []
    current_sentence = ""
    for sentence in sentences:
        if num_tokens_from_string(current_sentence + sentence) <= 500:
            if current_sentence != "":
                current_sentence += " "
            current_sentence += sentence
        else:
            merged_list.append(current_sentence)
            current_sentence = sentence
    if current_sentence:
        merged_list.append(current_sentence)
    return merged_list
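# Illustrative example (not from the original file): with the 500-token budget,
#     merge_sentences(["First sentence.", "Second sentence."])
# returns ["First sentence. Second sentence."] because both fit in one chunk,
# while a sentence that would push a chunk past the budget starts a new chunk.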
def split_into_sentences(text):
    sentences = sentence_tokenizer.tokenize(text)
    return sentences
def corrected_tone(message):
    """Rewrite each ~500-token chunk of `message` in a formal tone."""
    output_prompts = []
    split_sentences = split_into_sentences(message)
    token_safe_sentences = merge_sentences(split_sentences)
    for chunk in token_safe_sentences:
        prompt = (
            "You are going to act as a storyteller. Read the text below and change "
            "the text tone to a new formal tone. Important: do not change its meaning "
            "or content. Return only the output."
        )
        prompt += "\n"
        prompt += chunk.strip()
        output_prompts.append(prompt)
    with multiprocessing.Pool(processes=num_processes_tone) as pool:
        results = pool.map(langchain_function, output_prompts)
    output_text = " ".join(results)
    return output_text
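# NOTE: langchain_function is not defined or imported anywhere in this file, so
# corrected_tone() would raise a NameError if called. A minimal placeholder,
# assuming the intent is simply to run each rewrite prompt through the local
# model, could look like the sketch below (an assumption, not the original
# implementation; it must also live at module level so pool.map can pickle it):
#
#     def langchain_function(prompt: str) -> str:
#         return generate(prompt)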
@spaces.GPU
def generate(prompt):
    max_new_tokens = 1024
    temperature = 0.6
    top_p = 0.9
    top_k = 50
    repetition_penalty = 1.2

    conversation = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    # Run generation in a background thread; the streamer yields decoded text as it is produced.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
    return "".join(outputs)
with gr.Blocks(css="style.css") as demo:
    input_prompt = gr.Textbox(label="Prompt")
    output = gr.Textbox(label="Output")
    btn = gr.Button("Generate")
    btn.click(generate, inputs=input_prompt, outputs=output)
if __name__ == "__main__":
    demo.launch(share=True)