File size: 4,220 Bytes
d47b6b4
 
 
 
 
 
 
 
 
 
 
 
 
 
0cdc4a5
d47b6b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d27c7e7
d47b6b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d27c7e7
d47b6b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d27c7e7
d47b6b4
 
d27c7e7
d47b6b4
d27c7e7
d47b6b4
d27c7e7
d47b6b4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import gradio as gr
import numpy as np
from PIL import Image
from matplotlib import cm
import torch
from transformers import AutoTokenizer, AutoModel
from model import ImageModel, TextModel
import torch.nn.functional as F
import torchvision.transforms.v2 as transforms

# Load the tokenizer, the checkpoint and both models once, at import time.
MODEL_NAME = "distilbert/distilroberta-base"
class_names = ['Action', 'Adventure', 'Comedy', 'Drama', 'Fantasy', 'Romance', 'Sci-Fi']
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
cp = torch.load(r"model_only.pt", map_location="cpu")
model_img = ImageModel(len(class_names))
model_img.load_state_dict(cp['w_i'])
# The models are only used for inference below; eval() makes train-time-only
# layers (e.g. dropout / batch-norm, if the models contain any) behave
# deterministically.
model_img.eval()
model_text = TextModel(MODEL_NAME, len(class_names))
model_text.load_state_dict(cp['w_t'])
model_text.eval()

# Preprocessing applied to every input image: resize to 224x224, convert to a
# tensor, then normalize each of the three channels with mean 0.5 and std 0.5.
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])
def text_predictor(title, synopsis):
    """Predict score, award flag and genres from a title and synopsis.

    The title and synopsis are joined with a ``</s>`` separator, tokenized to a
    fixed length of 128 tokens and fed to ``model_text``.

    Args:
        title: Title text.
        synopsis: Synopsis text.

    Returns:
        A tuple ``(score, is_award, genres)`` where ``score`` is the model's
        score output rounded to two decimals, ``is_award`` is ``True`` when the
        sigmoid of the award output is at least 0.5, and ``genres`` is
        ``{"genres": [...]}`` listing every class whose sigmoid probability is
        at least 0.5.
    """
    prompt = f"{title} </s> {synopsis}"
    batch = tokenizer(
        prompt,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors='pt',
    )
    model_inputs = (batch['input_ids'], batch['attention_mask'])

    with torch.no_grad():
        raw_score, award_logit, genre_logits = model_text(model_inputs)
        # Drop the batch dimension (batch size is 1) before post-processing.
        rating = raw_score.squeeze(0)
        wins_award = F.sigmoid(award_logit.squeeze(0)) >= 0.5
        genre_probs = F.sigmoid(genre_logits.squeeze(0))

    # Multi-label decision: keep each genre whose probability reaches 0.5.
    chosen = [name for prob, name in zip(genre_probs, class_names) if prob >= 0.5]
    return round(rating.item(), 2), wins_award.item(), {"genres": chosen}
    
def img_predictor(img):
    """Predict score, award flag and genres from an image.

    Args:
        img: Image as a NumPy array, as delivered by the ``gr.Image`` input
            (assumed to be H x W x 3 RGB -- TODO confirm), or ``None`` when no
            image was supplied.

    Returns:
        A tuple ``(score, is_award, genres)`` where ``score`` is the model's
        score output rounded to two decimals, ``is_award`` is ``True`` when the
        sigmoid of the award output is at least 0.5, and ``genres`` is
        ``{"genres": [...]}`` listing every class whose sigmoid probability is
        at least 0.5.

    Raises:
        gr.Error: If no image was provided.
    """
    # Without this guard a missing upload fails with an opaque AttributeError
    # on ``img.astype``; raise a message the UI can show instead.
    if img is None:
        raise gr.Error("Please provide an image.")

    # Preprocess the image
    pil_img = Image.fromarray(img.astype('uint8'), 'RGB')  # Convert NumPy array to PIL Image
    batch = image_transforms(pil_img).unsqueeze(0)  # Apply transforms and add batch dimension

    # Make predictions
    with torch.no_grad():
        output = model_img(batch)
        # output is indexed as (score, award logit, genre logits); drop the
        # batch dimension from each.
        score = output[0].squeeze(0)
        isAward = F.sigmoid(output[1].squeeze(0)) >= 0.5
        genres = F.sigmoid(output[2].squeeze(0))

    # Multi-label decision: keep each genre whose probability reaches 0.5.
    preds_name = [cls for prob, cls in zip(genres, class_names) if prob >= 0.5]

    return round(score.item(), 2), isAward.item(), {"genres": preds_name}


def combine_predictor(title, synopsis, img):
    """Predict score, award flag and genres from text and image together.

    Both models are run and their raw outputs are averaged element-wise; the
    sigmoid for the award flag and the genres is applied to the averaged
    values.

    Args:
        title: Title text.
        synopsis: Synopsis text.
        img: Image as a NumPy array, as delivered by the ``gr.Image`` input
            (assumed to be H x W x 3 RGB -- TODO confirm), or ``None`` when no
            image was supplied.

    Returns:
        A tuple ``(score, is_award, genres)`` where ``score`` is the averaged
        score rounded to two decimals, ``is_award`` is ``True`` when the
        sigmoid of the averaged award output is at least 0.5, and ``genres`` is
        ``{"genres": [...]}`` listing every class whose sigmoid probability is
        at least 0.5.

    Raises:
        gr.Error: If no image was provided.
    """
    # Fail early with a readable message (and before tokenizing) instead of an
    # opaque AttributeError on ``img.astype`` when nothing was uploaded.
    if img is None:
        raise gr.Error("Please provide an image.")

    encoded_synopsis = tokenizer(
        f"{title} </s> {synopsis}",
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors='pt',
    )

    pil_img = Image.fromarray(img.astype('uint8'), 'RGB')  # Convert NumPy array to PIL Image
    img_batch = image_transforms(pil_img).unsqueeze(0)  # Apply transforms and add batch dimension

    # Make predictions
    with torch.no_grad():
        output_text = model_text((encoded_synopsis['input_ids'], encoded_synopsis['attention_mask']))
        output_img = model_img(img_batch)

        # Average the two models' raw outputs, then squash where needed.
        score = (output_img[0].squeeze(0) + output_text[0].squeeze(0)) / 2
        isAward = F.sigmoid((output_img[1].squeeze(0) + output_text[1].squeeze(0)) / 2) >= 0.5
        genres = F.sigmoid((output_img[2].squeeze(0) + output_text[2].squeeze(0)) / 2)

    # Multi-label decision: keep each genre whose probability reaches 0.5.
    preds_name = [cls for prob, cls in zip(genres, class_names) if prob >= 0.5]

    return round(score.item(), 2), isAward.item(), {"genres": preds_name}
    
# One interface per input modality; each reports a score label, an award
# label and a JSON payload with the predicted genres.
iface_1 = gr.Interface(
    fn=text_predictor,
    inputs=[
        gr.Text(placeholder="Input title here"),
        gr.Text(placeholder="Input synopsis here"),
    ],
    outputs=[
        gr.Label(label='Score'),
        gr.Label(label='Is Winning Award?'),
        "json",
    ],
)

iface_2 = gr.Interface(
    fn=img_predictor,
    inputs=gr.Image(height=224, width=224),
    outputs=[
        gr.Label(label='Score'),
        gr.Label(label='Is Winning Award?'),
        "json",
    ],
)

iface_3 = gr.Interface(
    fn=combine_predictor,
    inputs=[
        gr.Text(placeholder="Input title here"),
        gr.Text(placeholder="Input synopsis here"),
        gr.Image(height=224, width=224),
    ],
    outputs=[
        gr.Label(label='Score'),
        gr.Label(label='Is Winning Award?'),
        "json",
    ],
)

# Group the three interfaces as tabs of a single app and start it.
demo = gr.TabbedInterface(
    interface_list=[iface_1, iface_2, iface_3],
    tab_names=["From Text", "From Image", "From Text and Image"],
)
demo.launch()  # Launches the mini app!