# bioML_resurrection_project / LLM_middleware.py
# WIP: adding functions for loading and tokenization (commit 9e8bd13)
import os
import re
from typing import List, Optional

import openai
from datasets import load_dataset  # fixed: the datasets API function is `load_dataset` (singular); `load_datasets` does not exist
from transformers import AutoTokenizer, AutoModelForCausalLM
class LLM_Middleware():
    """Thin middleware wrapping OpenAI credentials, Hugging Face dataset
    loading, and Hugging Face tokenization.

    Attributes:
        hf_key: Hugging Face API token supplied at construction (stored on the
            instance; not yet consumed by the methods below).
        dataset: the most recently loaded dataset, set by ``loadDataset``;
            ``None`` until a dataset has been loaded.
    """

    hf_key: str
    # BUG FIX: the original class body had the bare statement `dataset`, which
    # raises NameError the moment the class is defined. Default it to None.
    dataset = None

    def __init__(self, openai_key, hf) -> None:
        """Store the API credentials.

        Args:
            openai_key: OpenAI API key; installed as the openai module's
                global credential.
            hf: Hugging Face token, kept on the instance as ``hf_key``.
        """
        # BUG FIX: the OpenAI client reads `openai.api_key`; assigning to
        # `openai.key` (as the original did) is silently ignored.
        openai.api_key = openai_key
        self.hf_key = hf

    def loadDataset(self, datasetName: str):
        """Load a dataset from the Hugging Face Hub and cache it.

        Args:
            datasetName: Hub identifier of the dataset (e.g. "imdb").

        Returns:
            The loaded dataset (also stored on ``self.dataset``).
        """
        # BUG FIX: the datasets library exposes `load_dataset` (singular);
        # the original called nonexistent `load_datasets`.
        self.dataset = load_dataset(datasetName)
        return self.dataset

    # BUG FIX: the original had no `self` parameter, so any instance call
    # (middleware.TokenizerFunction(...)) passed the instance as `modelName`.
    # @staticmethod fixes instance calls while keeping class-level calls
    # (LLM_Middleware.TokenizerFunction(...)) working as before.
    @staticmethod
    def TokenizerFunction(modelName: str, dataset):
        """Tokenize the "text" column of a dataset with a pretrained tokenizer.

        Args:
            modelName: Hub identifier of the model whose tokenizer to load.
            dataset: a mapping with a "text" entry (e.g. one split of a
                Hugging Face dataset) — TODO confirm against caller.

        Returns:
            The tokenizer's batch encoding, padded to the tokenizer's
            max length and truncated.
        """
        tokenizer = AutoTokenizer.from_pretrained(modelName)
        # BUG FIX: the original computed the encodings and discarded them;
        # return the result so callers can actually use it.
        return tokenizer(dataset["text"], padding="max_length", truncation=True)