File size: 3,403 Bytes
6df828c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import json

import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

from embeddings import logger

with open("hf_api.key") as f:
    HF_TOKEN = f.read().strip()


class HuggingFaceHosted:
    """Thin client for the Hugging Face hosted Inference API.

    Each task method builds a JSON payload and POSTs it to
    ``https://api-inference.huggingface.co/models/<model_id>``.
    """

    def __init__(self, model_id, api_token, verbose=False):
        # model_id: repo id on the Hub, e.g. "roberta-large".
        # api_token: bearer token sent on every request.
        # verbose: when True, text_generation logs its payload.
        self.model_id = model_id
        self.api_token = api_token
        self.verbose = verbose

    def query(self, data, timeout=60):
        """POST raw ``data`` to the inference endpoint and return parsed JSON.

        Args:
            data: JSON-encoded request body (str or bytes).
            timeout: seconds before the request is aborted; without one a
                stalled connection would block forever.

        Returns:
            The decoded JSON response (dict or list).
        """
        headers = {"Authorization": f"Bearer {self.api_token}"}
        api_url = f"https://api-inference.huggingface.co/models/{self.model_id}"
        # requests.post with an explicit timeout instead of
        # requests.request("POST", ...) with none: a hung server must not
        # hang the caller indefinitely.
        response = requests.post(api_url, headers=headers, data=data, timeout=timeout)
        # response.json() decodes using the charset the server declared,
        # rather than hard-coding utf-8.
        return response.json()

    def fill_mask(self, text):
        """Run the fill-mask task on ``text`` (must contain the mask token)."""
        data = json.dumps({"inputs": text})
        return self.query(data)

    def text_generation(self, text, **parameters):
        """Run text generation; ``parameters`` are passed through verbatim."""
        payload = {
            "inputs": text,
            "parameters": parameters,
        }
        if self.verbose:
            logger.info(payload)
        data = json.dumps(payload)
        return self.query(data)

    def summarization(self, text, do_sample=False):
        """Summarize ``text``; ``do_sample`` toggles sampling vs greedy."""
        data = json.dumps({"inputs": text, "parameters": {"do_sample": do_sample}})
        return self.query(data)

    def question_answering(self, question, context):
        """Answer ``question`` against the supplied ``context`` passage."""
        data = json.dumps(
            {
                "inputs": {
                    "question": question,
                    "context": context,
                }
            }
        )
        return self.query(data)


class CLIP:
    """Local CLIP wrapper that returns image/text embeddings as numpy arrays."""

    def __init__(self, model_id="openai/clip-vit-large-patch14"):
        # Downloads/loads weights on construction — this is heavyweight.
        self.model_id = model_id
        self.model = CLIPModel.from_pretrained(model_id)
        self.processor = CLIPProcessor.from_pretrained(model_id)

    def get_image_emb(self, image):
        """Embed an image.

        Args:
            image: a PIL image, or a filesystem path to open with PIL.

        Returns:
            numpy array of image features (leading batch axis).
        """
        if isinstance(image, str):
            image = Image.open(image)
        image_inputs = self.processor(images=image, return_tensors="pt", padding=True)
        # Inference only: no_grad skips building the autograd graph,
        # saving memory and time versus detaching afterwards.
        with torch.no_grad():
            out = self.model.get_image_features(**image_inputs)
        return out.detach().numpy()

    def get_text_emb(self, text):
        """Embed text (a string or list of strings).

        Returns:
            numpy array of text features (leading batch axis).
        """
        text_inputs = self.processor(text=text, return_tensors="pt", padding=True)
        with torch.no_grad():
            out = self.model.get_text_features(**text_inputs)
        return out.detach().numpy()

    def __repr__(self):
        return f"CLIP Local <{self.model_id}>"


class GPTJ(HuggingFaceHosted):
    """Hosted GPT-J (EleutherAI) text-generation client."""

    def __init__(
        self, model_id="EleutherAI/gpt-j-6B", api_token=HF_TOKEN, verbose=False
    ):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text, **parameters):
        # Calling the instance delegates to the shared generation endpoint.
        return self.text_generation(text, **parameters)

    def __repr__(self):
        return "GPTJ Hosted <{}>".format(self.model_id)


class MaskEncoder(HuggingFaceHosted):
    """Hosted masked-language-model client (defaults to roberta-large)."""

    def __init__(self, model_id="roberta-large", api_token=HF_TOKEN, verbose=False):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text):
        # Calling the instance delegates to the shared fill-mask endpoint.
        return self.fill_mask(text)

    def __repr__(self):
        return "MaskEncoder Hosted <{}>".format(self.model_id)


class T2T(HuggingFaceHosted):
    """Hosted text-to-text client (defaults to bigscience/T0pp)."""

    def __init__(self, model_id="bigscience/T0pp", api_token=HF_TOKEN, verbose=False):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text, **parameters):
        # Calling the instance delegates to the shared generation endpoint.
        return self.text_generation(text, **parameters)

    def __repr__(self):
        return "T2T Hosted <{}>".format(self.model_id)