# bioML_resurrection_project / LLM_middleware.py
# WIP: adding functions for loading and tokenization (commit 9e8bd13)
import os
import re
from typing import List, Optional

import openai
from datasets import load_dataset  # fixed: the datasets API function is `load_dataset` (singular); `load_datasets` does not exist
from transformers import AutoTokenizer, AutoModelForCausalLM
class LLM_Middleware():
    """Thin middleware wrapping OpenAI credentials, Hugging Face dataset
    loading, and Hugging Face tokenization.

    Attributes:
        hf_key: Hugging Face API token supplied at construction (stored on the
            instance; not yet consumed by the methods below).
        dataset: the most recently loaded dataset, set by ``loadDataset``;
            ``None`` until a dataset has been loaded.
    """

    hf_key: str
    # BUG FIX: the original class body had the bare statement `dataset`, which
    # raises NameError the moment the class is defined. Default it to None.
    dataset = None

    def __init__(self, openai_key, hf) -> None:
        """Store the API credentials.

        Args:
            openai_key: OpenAI API key; installed as the openai module's
                global credential.
            hf: Hugging Face token, kept on the instance as ``hf_key``.
        """
        # BUG FIX: the OpenAI client reads `openai.api_key`; assigning to
        # `openai.key` (as the original did) is silently ignored.
        openai.api_key = openai_key
        self.hf_key = hf

    def loadDataset(self, datasetName: str):
        """Load a dataset from the Hugging Face Hub and cache it.

        Args:
            datasetName: Hub identifier of the dataset (e.g. "imdb").

        Returns:
            The loaded dataset (also stored on ``self.dataset``).
        """
        # BUG FIX: the datasets library exposes `load_dataset` (singular);
        # the original called nonexistent `load_datasets`.
        self.dataset = load_dataset(datasetName)
        return self.dataset

    # BUG FIX: the original had no `self` parameter, so any instance call
    # (middleware.TokenizerFunction(...)) passed the instance as `modelName`.
    # @staticmethod fixes instance calls while keeping class-level calls
    # (LLM_Middleware.TokenizerFunction(...)) working as before.
    @staticmethod
    def TokenizerFunction(modelName: str, dataset):
        """Tokenize the "text" column of a dataset with a pretrained tokenizer.

        Args:
            modelName: Hub identifier of the model whose tokenizer to load.
            dataset: a mapping with a "text" entry (e.g. one split of a
                Hugging Face dataset) — TODO confirm against caller.

        Returns:
            The tokenizer's batch encoding, padded to the tokenizer's
            max length and truncated.
        """
        tokenizer = AutoTokenizer.from_pretrained(modelName)
        # BUG FIX: the original computed the encodings and discarded them;
        # return the result so callers can actually use it.
        return tokenizer(dataset["text"], padding="max_length", truncation=True)