File size: 3,273 Bytes
b7c468b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler


from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

import sys

import pytz
# Timezone used for every timestamp this section prints.
IST = pytz.timezone('Asia/Kolkata')
print(datetime.datetime.now(IST).strftime("%c"))

# Base GPT-2 tokenizer with explicit BOS / EOS / PAD special tokens registered.
# (Author's note on the original line: gpt2-medium is an alternative checkpoint.)
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)

# The config is not customised beyond turning off hidden-state outputs.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Instantiate the model from the base checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# Special tokens were added to the tokenizer above, so the embedding matrix
# must be resized; otherwise the tokenizer and model tensors won't match up.
model.resize_token_embeddings(len(tokenizer))

# Run on the GPU when one is available; fall back to the CPU instead of
# crashing in `model.to(...)` on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

if device.type == "cuda":
    print('Model loaded to GPU')
else:
    print('Model loaded to CPU (CUDA is not available)')
print(datetime.datetime.now(IST).strftime("%c"))

# Directory holding the fine-tuned model and tokenizer files.
output_dir = '/media/data_dump/Ritwik/ggpt/model_save/pytorch_save_files/'

print('Loading fine-tuned weights')

# `from_pretrained` is a classmethod: it builds fresh objects from the files
# in `output_dir`, which then replace the base model / tokenizer bindings.
finetuned_model = GPT2LMHeadModel.from_pretrained(output_dir)
model = finetuned_model.to(device)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

print('Model and tokenizer loaded!')
timestamp = datetime.datetime.now(IST)
print(timestamp.strftime("%c"))

# Put the model in evaluation mode before generating text.
model.eval()

# Alternative prompts (left commented out by the author; move any of them
# into `prompt_list` to generate from them as well):
#   '<|startoftext|> Regarding Kashmir I am very confident to say that'
#   '<|startoftext|> I wanted to save bhagat singh but'
#   '<|startoftext|> I wanted to save bhagat singh but fortunately'
#   '<|startoftext|> I wanted to save bhagat singh but unfortunately'
#   '<|startoftext|> Reporter: What is your biggest fear? Gandhi:'
#   '<|startoftext|> Question) What is your biggest fear?'
#   '<|startoftext|> Regarding Muslims and Islam I strongly believe that'
#   '<|startoftext|> I wish to say that'
#   '<|startoftext|> I chose Nehru over Patel for Prime Minister because'
#   '<|startoftext|> During my experiments with truth I observed that'
#   '<|startoftext|> My opinion on the negroes of Africa is that'
prompt_list = ['<|startoftext|> Regarding Kashmir I am very confident to say that']

# For each prompt: print it, sample three continuations, print them decoded,
# then print a timestamp.
for prompt in prompt_list:

    print(prompt)

    # Encode the prompt and add a leading batch dimension (batch of one),
    # then move it to the same device as the model.
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    print(generated)

    sample_outputs = model.generate(
        generated,
        # The prompt is a single, unpadded sequence, so every position is a
        # real token; stating this explicitly stops `generate` from guessing.
        attention_mask=torch.ones_like(generated),
        # Samples that finish early are padded up to the longest one; use the
        # tokenizer's pad token instead of relying on an implicit fallback.
        pad_token_id=tokenizer.pad_token_id,
        # bos_token_id=random.randint(1,30000),  # disabled by the author
        do_sample=True,
        top_k=50,
        max_length=500,
        top_p=0.95,
        num_return_sequences=3,
    )

    # Special tokens (BOS / EOS / PAD) are stripped from the decoded text.
    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

    print(datetime.datetime.now(IST).strftime("%c"))
    print('\n')