File size: 791 Bytes
c44d252
 
 
 
 
 
 
9e8bd13
c44d252
 
 
9e8bd13
 
c44d252
 
 
9e8bd13
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import os
import re
from typing import List, Optional

import openai
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

class LLM_Middleware():
    """Hold API credentials and wrap Hugging Face dataset loading / tokenizing.

    The constructor installs the OpenAI key on the ``openai`` module and keeps
    the Hugging Face token on the instance. ``loadDataset`` caches the loaded
    dataset on the instance; ``TokenizerFunction`` is a stateless helper.
    """

    # Hugging Face token exactly as passed to the constructor.
    hf_key: str
    # Result of the most recent loadDataset() call; None until one is made.
    dataset: Optional[object] = None

    def __init__(self, openai_key, hf) -> None:
        """Store the credentials.

        Args:
            openai_key: OpenAI API key, set as the module-level
                ``openai.api_key``.
            hf: Hugging Face token, kept on the instance as ``hf_key``.
        """
        # The openai package reads its key from ``api_key``; assigning to
        # ``openai.key`` would only create an attribute nothing looks at.
        openai.api_key = openai_key
        self.hf_key = hf

    def loadDataset(self, datasetName: str):
        """Load a dataset via ``datasets.load_dataset`` and cache it.

        Args:
            datasetName: Identifier handed unchanged to ``load_dataset``.

        Returns:
            The loaded dataset object, which is also stored on
            ``self.dataset``.
        """
        # NOTE(review): the stored ``hf_key`` is not forwarded to the loader;
        # confirm whether gated/private datasets need it passed explicitly.
        self.dataset = load_dataset(datasetName)
        return self.dataset

    @staticmethod
    def TokenizerFunction(modelName: str, dataset):
        """Tokenize ``dataset["text"]`` with the tokenizer of ``modelName``.

        Declared as a static method because it takes no ``self``; it can be
        called on the class or on an instance with the same two arguments.

        Args:
            modelName: Model identifier passed to
                ``AutoTokenizer.from_pretrained``.
            dataset: Any object supporting ``dataset["text"]``.

        Returns:
            Whatever the tokenizer call returns for the ``"text"`` entries,
            with ``padding="max_length"`` and ``truncation=True`` applied.
        """
        tokenizer = AutoTokenizer.from_pretrained(modelName)
        # NOTE(review): the original comment referred to this as "the JSON
        # function" and said a more specific function is needed; presumably
        # only the "text" field of JSON-style records is handled - confirm.
        return tokenizer(dataset["text"], padding="max_length", truncation=True)