import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from peft import LoraConfig, get_peft_model
import gradio as gr
# Preprocess reviews
reviews_path = "data_reviews.txt"
with open(reviews_path, "r") as reviews_raw:
    reviews = reviews_raw.readlines()
reviews = [review.replace("TL;DR", " TL;DR ") for review in reviews]
max_length = 200
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Number of tokens the " TL;DR " task designator adds to each encoded review
extra_length = len(tokenizer.encode(" TL;DR "))
config = LoraConfig(
    r=512,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model = model.to(device)

# Create the optimizer only after wrapping with PEFT, so it optimizes the
# trainable LoRA parameters rather than the (now frozen) base weights
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
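# Optional sanity check (PEFT utility): report how many parameters are
# actually trainable; with r=512 the adapter is unusually large for GPT-2
model.print_trainable_parameters()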
class ReviewDataset(Dataset):
    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            # Encode the text with tokenizer.encode(), adding EOS at the end
            tokenized = self.tokenizer.encode(review + self.eos)
            # Pad/truncate the encoded sequence to a fixed length
            padded = self.pad_truncate(tokenized)
            # Store as a tensor
            self.result.append(torch.tensor(padded))

    def __len__(self):
        return len(self.result)

    def __getitem__(self, item):
        return self.result[item]
    def pad_truncate(self, name):
        # Length of the review itself, excluding the task-designator tokens
        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            # Truncate, then re-append EOS; the +3 compensates for the
            # designator tokens so every sequence ends up the same length
            # (assuming " TL;DR " encodes to 4 tokens)
            result = name[:self.max_len + 3] + [self.eos_id]
        else:
            result = name
        return result
dataset = ReviewDataset(tokenizer, reviews, max_length)
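# Illustrative check: every padded item should end up the same length, or the
# default collate in the DataLoader below cannot stack them into one batch
assert len({len(dataset[i]) for i in range(len(dataset))}) == 1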
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)
epochs = 2
for epoch in range(epochs):
    for batch in dataloader:
        with torch.set_grad_enabled(True):
            optimizer.zero_grad()
            batch = batch.to(device)
            # Language-modeling objective: the inputs also serve as the labels
            output = model(batch, labels=batch)
            loss = output.loss
            loss.backward()
            optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")
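# Optional: persist the trained adapter weights; on a PEFT-wrapped model,
# save_pretrained() stores only the LoRA adapter ("lora-tldr" is an
# illustrative output path, not from the original script)
model.save_pretrained("lora-tldr")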
def topk(probs, n=9):
    # Softmax the scores to convert them to probabilities
    probs = torch.softmax(probs, dim=-1)
    # PyTorch's built-in top-k selection
    tokensProb, topIx = torch.topk(probs, k=n)
    # Renormalize the reduced selection pool (n choices)
    tokensProb = tokensProb / torch.sum(tokensProb)
    # Move to CPU for numpy handling
    tokensProb = tokensProb.cpu().detach().numpy()
    # Sample one token id from the pool according to the new distribution
    choice = np.random.choice(n, 1, p=tokensProb)
    tokenId = topIx[choice][0]
    return int(tokenId)
def model_infer(model, tokenizer, review, max_length=15):
    # Encode the prompt, which ends with the task designator " TL;DR "
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.no_grad():
        # Feed the prompt to the model
        output = model(initial_input)
        # Take the logits at the final time step
        logits = output.logits[0, -1]
        # Make a top-k choice and append it to the result
        result.append(topk(logits))

        # Generate up to max_length further tokens
        for _ in range(max_length):
            # Feed the current sequence to the model and sample the next token
            seq = torch.tensor(result).unsqueeze(0).to(device)
            output = model(seq)
            logits = output.logits[0, -1]
            res_id = topk(logits)

            # If the chosen token is EOS, stop and return the decoded text
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else:
                result.append(res_id)

    # If no EOS was generated, return after max_length tokens
    return tokenizer.decode(result)
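# Illustrative smoke test before wiring up the UI (the review text here is
# made up, not taken from the training data)
print(model_infer(model, tokenizer, "Great machine, brews fast and quietly. TL;DR "))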
def summarize(review):
    # Append the task designator, then return only the text generated after it
    summary = model_infer(model, tokenizer, review + " TL;DR ")
    return summary.split(" TL;DR ")[1].strip()
iface = gr.Interface(fn=summarize, inputs="text", outputs="text")
iface.launch()