import time
from transformers import (GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          XLNetLMHeadModel, XLNetTokenizer,
                          TransfoXLLMHeadModel, TransfoXLTokenizer,
                          CTRLLMHeadModel, CTRLTokenizer)
from Utils import forward, create_context
import torch
import torch.nn.functional as F
from math import floor
import requests
import json
import os
from PPLM import run_model as run_pplm, DISCRIMINATOR_MODELS_PARAMS
from GPUHandler import GPUHandler
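# Prompt padding passed to create_context below; presumably prepended so that models
# which need a long context to generate sensibly (e.g. XLNet, Transfo-XL) have one
# even when the user has only typed a few words. The excerpt is from Brave New World.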
PADDING_TEXT = """With eyes for the most part downcast and, if ever they lighted on a fellow creature, at once and | |
furtively averted, Bernard hastened across the roof. He was like a man pursued, but pursued by enemies he does not | |
wish to see, lest they should seem more hostile even than he had supposed, and he himself be made to feel guiltier | |
and even more helplessly alone. That horrible Benito Hoover!’ And yet the man had meant well enough. Which only made | |
it, in a way, much worse. Those who meant well behaved in the same way as those who meant badly. Even Lenina was making | |
him suffer. He remembered those weeks of timid indecision, during which he had looked and longed and despaired of ever | |
having the courage to ask her. Dared he face the risk of being humiliated by a contemptuous refusal? But if she were to | |
say yes, what rapture! Well, now she had said it and he was still wretched—wretched that she should have thought it | |
such a perfect afternoon for Obstacle Golf, that she should have trotted away to join Henry Foster, that she should | |
have found him funny for not wanting to talk of their most private affairs in public. Wretched, in a word, because she | |
had behaved as any healthy and virtuous English girl ought to behave and not in some other, abnormal, extraordinary | |
way. <eod> </s> <eos>""" | |
try:
    PID = int(requests.get(url="http://localhost:3000").json())
    N_GPU = torch.cuda.device_count()
    GPU_PER_WORKER = int(os.getenv("GPU_PER_WORKER"))
    GPU_IDS = list(range(PID * GPU_PER_WORKER, (PID + 1) * GPU_PER_WORKER))
    print("Successfully initialized thread with id {}. The GPU ids attributed are: {}".format(PID, GPU_IDS))

    with open(os.getenv("FILE")) as json_file:
        data = json.load(json_file)
        models = data["models_to_load"]
        cached_models = data.get("cached_models")
except (requests.exceptions.ConnectionError, TypeError):
    if __name__ == "__main__":
        PID = 0
        N_GPU = torch.cuda.device_count()
        GPU_PER_WORKER = 1
        GPU_IDS = [0]
        print("Successfully initialized development thread with id {}. The GPU ids attributed are: {}".format(PID, GPU_IDS))
        models = ["pplm"]
        cached_models = None
    else:
        raise requests.exceptions.ConnectionError("The PID server is not running.")
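# Illustrative example of the GPU assignment above: with GPU_PER_WORKER=2, the worker
# that receives PID=1 from the local PID server gets GPU_IDS=[2, 3].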
handler = GPUHandler(int(), models, GPU_IDS, cached_models)

models = {}
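# Each model spec yielded by GPUHandler is assumed (judging from its usage in the loop
# below) to look roughly like:
#   {"identifier": "gpt2/small", "model": GPT2LMHeadModel, "tokenizer": GPT2Tokenizer,
#    "checkpoint": "gpt2", "device": "cuda:0",
#    "cached_path": ..., "configuration_options": ...}  # the last two keys are optional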
for gpu in handler.gpus:
    for model in gpu.models:
        model_name = model["identifier"]
        print(f"Loading {model_name} model and tokenizer")
        models[model_name] = model

        if model.get("cached_path"):
            print("Loading {} from local path.".format(model_name))
            model_checkpoint_path = model["cached_path"]
        else:
            model_checkpoint_path = model["checkpoint"]

        if "configuration_options" in models[model_name]:
            configuration_options = models[model_name]["configuration_options"]
            print("Specific configuration options", configuration_options["options"])

            config = configuration_options["config"].from_pretrained(model_checkpoint_path)
            for option_key, option_value in configuration_options["options"].items():
                setattr(config, option_key, option_value)

            models[model_name]["model"] = models[model_name]["model"].from_pretrained(model_checkpoint_path, config=config).to(models[model_name]["device"])
        else:
            models[model_name]["model"] = models[model_name]["model"].from_pretrained(model_checkpoint_path).to(models[model_name]["device"])

        models[model_name]["tokenizer"] = models[model_name]["tokenizer"].from_pretrained(models[model_name]["checkpoint"])
        models[model_name]["model"].eval()

print("All models successfully loaded.")
def top_k_top_p_filtering(batch_logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """
    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
    :param batch_logits: logits output by the model (one row per sequence in the batch)
    :param top_k: >0: keep only top k tokens with highest probability (top-k filtering).
    :param top_p: >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
    :param filter_value: value assigned to the filtered-out logits (defaults to -inf)
    :return: A top_p/top_k filtered tensor of logits
    """
    for i in range(batch_logits.size(0)):
        logits = batch_logits[i]
        assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
        top_k = min(top_k, logits.size(-1))  # Safety check

        if top_k and top_k > 0:
            # Remove all tokens with a probability less than the last token of the top-k
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = filter_value

        if top_p and top_p > 0.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[indices_to_remove] = filter_value

        if 'batched_logits' in locals():
            batched_logits = torch.cat((batched_logits, logits.unsqueeze(0)), dim=0)
        else:
            batched_logits = logits.unsqueeze(0)

    return batched_logits
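# Illustrative usage (comment only, not executed at import time): with top_k=2, only
# the two highest logits per row survive; everything else is set to filter_value
# before softmax/sampling. For example, the row [1.0, 3.0, 0.5, 2.0] becomes roughly
# [-inf, 3.0, -inf, 2.0] after top_k_top_p_filtering(logits, top_k=2).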
def check_tensor_for_eot(output, eot_token, dot_token):
    # True if every sequence in the batch contains either the eot token or the dot token
    return all([(eot_token in output_item or dot_token in output_item) for output_item in output.tolist()])


def truncate_after_eot(output, eot_tokens):
    result = []
    for i in range(output.size(0)):
        if any([eot_token in output[i] for eot_token in eot_tokens]):
            item = output[i].tolist()
            index = find_min_value_in_array(item, eot_tokens)
            result.append(item[:index] + [eot_tokens[0]])
        else:
            result.append(output[i].tolist())
    return result


def find_min_value_in_array(array, values):
    # Returns the index of the earliest occurrence of any of `values` in `array`
    indexes = []
    for value in values:
        try:
            indexes.append(array.index(value))
        except ValueError:
            pass  # Couldn't find value in array
    return min(indexes)
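# Illustrative example with hypothetical token ids: given eot_tokens=[13, 50256],
# truncate_after_eot(torch.tensor([[5, 7, 50256, 9]]), [13, 50256]) returns [[5, 7, 13]]:
# each sequence is cut at the first end token found and terminated with eot_tokens[0].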
# @lru_cache()
def generate_completion(
        raw_text,
        length=-1,
        max_time=-1,
        model_name="small",
        temperature=1,
        max_tokens=256,
        top_p=0.0,
        top_k=0,
        batch_size=3,
        repetition_penalty=1.2,

        # PPLM
        bag_of_words_or_discrim=None,
        stepsize=0.02,
        gamma=1.5,
        num_iterations=3,
        window_length=5,
        kl_scale=0.01,
        gm_scale=0.95,
        use_sampling=False
):
    start = time.time()

    try:
        print("Running with model", model_name)
        model, tokenizer, device = models[model_name]["model"], models[model_name]["tokenizer"], models[model_name]["device"]
    except KeyError:
        print("Error: model {} not found. Defaulting to the small model.".format(model_name))
        model, tokenizer, device = models["gpt2/small"]["model"], models["gpt2/small"]["tokenizer"], models["gpt2/small"]["device"]
if "pplm" in model_name: | |
if ":" in bag_of_words_or_discrim: | |
discrim, discrim_label = bag_of_words_or_discrim.split(":") | |
discrim_label = DISCRIMINATOR_MODELS_PARAMS[discrim]["class_id"][int(discrim_label)] | |
bag_of_words = None | |
# Hardcoded parameters for the discriminator | |
gamma = 1.0 | |
print("Running PPLM with discriminator:", discrim, discrim_label) | |
else: | |
bag_of_words = bag_of_words_or_discrim | |
discrim = None | |
discrim_label = None | |
# Hardcoded parameters for the BOW | |
gamma = 1.5 | |
window_length = 5 | |
print("Running PPLM with bag of words:", bag_of_words) | |
print("kl", kl_scale, "gm", gm_scale, "sampling", use_sampling, "window length", window_length, "gamma", gamma, "temperature", temperature) | |
return run_pplm( | |
model, tokenizer, device, raw_text, | |
max_time=max_time, | |
discrim=discrim, | |
discrim_label=discrim_label, | |
num_samples=batch_size, | |
bag_of_words=bag_of_words, | |
length=length, | |
temperature=temperature, | |
top_k=top_k, | |
stepsize=stepsize, | |
gamma=gamma, | |
num_iterations=num_iterations, | |
window_length=window_length, | |
kl_scale=kl_scale, | |
gm_scale=gm_scale, | |
use_sampling=use_sampling | |
) | |
    context_tokens, eot_token, dot_token = create_context(model_name, tokenizer, raw_text, PADDING_TEXT, max_tokens=max_tokens)

    if length == -1:
        length = 100

    context = torch.tensor(context_tokens, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
    prev = context
    past = None
    with torch.no_grad():
        for _ in range(length):
            try:
                output = forward(model_name, model, prev, past, device=device)
            except RuntimeError:
                return "ERROR 500: OOM. TransfoXL asked for too much memory."

            logits, past = output if len(output) > 2 else output[0], None
            logits = logits[:, -1, :] / max(temperature, 0.001)

            if "ctrl" in model_name:
                for i in range(batch_size):
                    for j in set(prev[i].tolist()):
                        logits[i, j] /= repetition_penalty

            logits = top_k_top_p_filtering(logits, top_p=top_p, top_k=top_k)
            probs = F.softmax(logits, dim=-1)
            token = torch.multinomial(probs, num_samples=1)

            prev = torch.cat((prev, token), dim=1)

            # Break once every sequence in the batch contains an eot/dot token, or once the time budget is exhausted.
            if check_tensor_for_eot(prev[:, len(context_tokens):], eot_token, dot_token) or (max_time != -1 and time.time() - start + 0.1 > max_time):
                break

    out = prev[:, len(context_tokens):]

    # Remove the words following the eot tokens.
    out = truncate_after_eot(out, list(filter(lambda t: t is not None, [dot_token, eot_token])))

    end = time.time()

    # Remove empty sentences and duplicates
    generations = list(set(filter(lambda x: len(x) > 0, [" " + tokenizer.decode(single_generation).strip() for single_generation in out])))
    sentences = [
        {"value": generations[i], "time": end - start, "tokens": len(out[i])} for i in range(len(generations))
    ]

    # print(end - start, [len(out[i]) for i in range(len(generations))])
    return sentences
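# For non-PPLM models, generate_completion returns a list of dicts of the form
# {"value": <decoded text>, "time": <seconds>, "tokens": <token count>}, one per
# generation surviving the empty/duplicate filter; for "pplm" models it returns
# whatever run_pplm returns.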
if __name__ == "__main__": | |
print(generate_completion( | |
"My dog died", | |
length=30, model_name="pplm", batch_size=3, top_k=10, top_p=0.9, | |
bag_of_words_or_discrim="sentiment:2", | |
stepsize=0.03, | |
gamma=1, | |
num_iterations=3, | |
window_length=5, | |
kl_scale=0.01, | |
gm_scale=0.95, | |
max_time=-1, | |
use_sampling=False | |
)) | |
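    # A plain (non-PPLM) call would look roughly like the following, assuming a
    # "gpt2/small" entry was loaded from the configuration file:
    # print(generate_completion("My dog died", length=30, model_name="gpt2/small",
    #                           batch_size=3, top_k=10, top_p=0.9, temperature=1))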