"""Lightweight middleware around OpenAI and Hugging Face tooling.

Wires up API credentials, loads datasets from the Hugging Face Hub,
and tokenizes them for downstream model work.
"""

import os
import re
from typing import List, Optional

import openai
# Bug fix: the datasets library exposes `load_dataset` (singular);
# `load_datasets` does not exist and the original import raised ImportError.
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


class LLM_Middleware:
    """Manage API keys and dataset loading/tokenization for LLM workflows."""

    # Hugging Face API key supplied at construction time.
    hf_key: str
    # Most recently loaded dataset (populated by loadDataset); None until then.
    # Bug fix: the original bare `dataset` name raised NameError at class
    # creation time — a class-body name must be assigned or annotated.
    dataset = None

    def __init__(self, openai_key: str, hf: str) -> None:
        """Store the API credentials.

        Args:
            openai_key: OpenAI API key.
            hf: Hugging Face API key.
        """
        # Bug fix: the OpenAI SDK reads `openai.api_key`; assigning to
        # `openai.key` (as the original did) is silently ignored.
        openai.api_key = openai_key
        self.hf_key = hf

    def loadDataset(self, datasetName: str):
        """Load a dataset from the Hugging Face Hub and cache it on self.

        Args:
            datasetName: Hub identifier of the dataset (e.g. "imdb").

        Returns:
            The loaded dataset object, also stored in ``self.dataset``.
        """
        self.dataset = load_dataset(datasetName)
        return self.dataset

    def TokenizerFunction(self, modelName: str, dataset):
        """Tokenize the ``"text"`` field of *dataset* with *modelName*'s tokenizer.

        Args:
            modelName: Model identifier whose tokenizer to load.
            dataset: Mapping with a ``"text"`` field — presumably a
                datasets split; verify against callers.

        Returns:
            The tokenizer output (input_ids, attention_mask, ...),
            padded to max length and truncated.
        """
        tokenizer = AutoTokenizer.from_pretrained(modelName)
        # Bug fixes vs original: the method lacked `self` (so instance calls
        # mis-bound arguments) and discarded the tokenizer's result.
        return tokenizer(dataset["text"], padding="max_length", truncation=True)