# bioML_resurrection_project / LLM_middleware.py
# WIP: adding functions for loading and tokenization
# (commit 9e8bd13, author: Dhruv)
import os
import re
from typing import List, Optional

import openai
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
class LLM_Middleware:
    """Middleware wrapping OpenAI credentials plus Hugging Face dataset
    loading and tokenization helpers.

    Attributes:
        hf_key: Hugging Face API key supplied at construction.
        dataset: The most recently loaded dataset (None until loadDataset runs).
    """

    hf_key: str
    # Must be assigned a value: a bare `dataset` at class level raises
    # NameError when the class body executes.
    dataset = None

    def __init__(self, openai_key: str, hf: str) -> None:
        """Store credentials.

        Args:
            openai_key: OpenAI API key.
            hf: Hugging Face API key, kept on the instance as `hf_key`.
        """
        # The openai library reads `openai.api_key`; assigning to
        # `openai.key` silently creates an unused attribute.
        openai.api_key = openai_key
        self.hf_key = hf

    def loadDataset(self, datasetName: str):
        """Load a dataset by name via the Hugging Face `datasets` library.

        The correct API is `load_dataset` (singular); `load_datasets`
        does not exist. The result is cached on `self.dataset`.

        Args:
            datasetName: Dataset identifier on the Hugging Face Hub.

        Returns:
            The loaded DatasetDict.
        """
        self.dataset = load_dataset(datasetName)
        return self.dataset

    def TokenizerFunction(self, modelName: str, dataset):
        """Tokenize `dataset["text"]` with the named model's tokenizer.

        Args:
            modelName: Model identifier whose tokenizer to load.
            dataset: A mapping with a "text" field (e.g. a datasets split
                or batch passed by `Dataset.map`).

        Returns:
            The tokenizer's batch encoding, padded to max length and
            truncated.
        """
        # Original was missing `self` (TypeError on instance call) and
        # dropped the tokenizer output instead of returning it.
        tokenizer = AutoTokenizer.from_pretrained(modelName)
        return tokenizer(dataset["text"], padding="max_length", truncation=True)