Spaces:

jalder1989
/

Wizard-Vicuna-30B-Uncensored-WebUI

Paused

App Files Files Community

Wizard-Vicuna-30B-Uncensored-WebUI / modules /RWKV.py

jalder1989

Upload folder using huggingface_hub

f520676 10 months ago

raw

history blame contribute delete

5.75 kB

	'''
	This loader is not currently maintained as RWKV can now be loaded
	through the transformers library.
	'''

	import copy
	import os
	from pathlib import Path

	import numpy as np
	from tokenizers import Tokenizer
	from transformers import is_torch_xpu_available

	import modules.shared as shared
	from modules.callbacks import Iteratorize

	np.set_printoptions(precision=4, suppress=True, linewidth=200)

	os.environ['RWKV_JIT_ON'] = '1'
	os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0' # use CUDA kernel for seq mode (much faster)

	from rwkv.model import RWKV
	from rwkv.utils import PIPELINE, PIPELINE_ARGS


	class RWKVModel:
	def __init__(self):
	pass

	@classmethod
	def from_pretrained(self, path, dtype="bf16" if is_torch_xpu_available() else "fp16", device="xpu" if is_torch_xpu_available() else "cuda"):
	tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json")
	if shared.args.rwkv_strategy is None:
	model = RWKV(model=str(path), strategy=f'{device} {dtype}')
	else:
	model = RWKV(model=str(path), strategy=shared.args.rwkv_strategy)

	pipeline = PIPELINE(model, str(tokenizer_path))
	result = self()
	result.pipeline = pipeline
	result.model = model
	result.cached_context = ""
	result.cached_model_state = None
	result.cached_output_logits = None
	return result

	def generate(self, prompt, state, callback=None):
	args = PIPELINE_ARGS(
	temperature=state['temperature'],
	top_p=state['top_p'],
	top_k=state['top_k'],
	alpha_frequency=0.1, # Frequency Penalty (as in GPT-3)
	alpha_presence=0.1, # Presence Penalty (as in GPT-3)
	token_ban=[0], # ban the generation of some tokens
	token_stop=[]
	)

	if self.cached_context != "":
	if prompt.startswith(self.cached_context):
	prompt = prompt[len(self.cached_context):]
	else:
	self.cached_context = ""
	self.cached_model_state = None
	self.cached_output_logits = None

	# out = self.pipeline.generate(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
	out = self.generate_from_cached_state(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
	return out

	def generate_with_streaming(self, args, *kwargs):
	with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
	reply = ''
	for token in generator:
	reply += token
	yield reply

	# Similar to the PIPELINE.generate, but lets us maintain the cached_model_state
	def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None):
	all_tokens = []
	out_str = ''
	occurrence = {}
	state = copy.deepcopy(self.cached_model_state) if self.cached_model_state is not None else None

	# if we ended up with an empty context, just reuse the cached logits
	# this can happen if a user undoes a message and then sends the exact message again
	# in that case the full context ends up being the same as the cached_context, so the remaining context is empty.
	if ctx == "":
	out = self.cached_output_logits

	token = None
	for i in range(token_count):
	# forward
	tokens = self.pipeline.encode(ctx) if i == 0 else [token]
	while len(tokens) > 0:
	out, state = self.model.forward(tokens[:args.chunk_len], state)
	tokens = tokens[args.chunk_len:]
	if i == 0:
	begin_token = len(all_tokens)
	last_token_posi = begin_token
	# cache the model state after scanning the context
	# we don't cache the state after processing our own generated tokens because
	# the output string might be post-processed arbitrarily. Therefore, what's fed into the model
	# on the next round of chat might be slightly different what what it output on the previous round
	if i == 0:
	self.cached_context += ctx
	self.cached_model_state = copy.deepcopy(state)
	self.cached_output_logits = copy.deepcopy(out)

	# adjust probabilities
	for n in args.token_ban:
	out[n] = -float('inf')

	for n in occurrence:
	out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

	# sampler
	token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k)
	if token in args.token_stop:
	break

	all_tokens += [token]
	if token not in occurrence:
	occurrence[token] = 1
	else:
	occurrence[token] += 1

	# output
	tmp = self.pipeline.decode(all_tokens[last_token_posi:])
	if '\ufffd' not in tmp: # is valid utf-8 string?
	if callback:
	callback(tmp)

	out_str += tmp
	last_token_posi = begin_token + i + 1
	return out_str


	class RWKVTokenizer:
	def __init__(self):
	pass

	@classmethod
	def from_pretrained(self, path):
	tokenizer_path = path / "20B_tokenizer.json"
	tokenizer = Tokenizer.from_file(str(tokenizer_path))
	result = self()
	result.tokenizer = tokenizer
	return result

	def encode(self, prompt):
	return self.tokenizer.encode(prompt).ids

	def decode(self, ids):
	return self.tokenizer.decode(ids)