Transformers
PyTorch
JAX
English
t5
text2text-generation
biomedical
clinical
ul2
encoder-decoder
pretraining
medical
text-generation-inference
Instructions to use Siddharth63/pubmedul2_small with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Siddharth63/pubmedul2_small with Transformers:
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Siddharth63/pubmedul2_small")
model = AutoModelForSeq2SeqLM.from_pretrained("Siddharth63/pubmedul2_small")
```
- Notebooks
- Google Colab
- Kaggle
| # %pip install sentencepiece | |
| # %pip install datasets | |
| # %pip install seqio | |
| import unicodedata | |
| import os | |
| import nltk | |
| from tqdm import tqdm | |
| import glob | |
| from random import sample | |
def sample_and_make_tempfile(sentences_dir, num_files):
    """Sample sentence files and merge them into one temporary text file.

    Picks ``num_files`` of the ``*.txt`` files found in ``sentences_dir`` at
    random, reads all of their lines, strips surrounding whitespace from each
    line and writes the non-empty ones to ``text/temp.txt`` (one per line).

    :param sentences_dir: Directory holding the sentence-per-line ``.txt`` files.
    :param num_files: Number of files to sample from that directory.
    :return: Path of the combined file (``text/temp.txt``).
    :raises ValueError: If ``num_files`` is larger than the number of files
        found (raised by ``random.sample``).
    """
    sentence_files = glob.glob(sentences_dir + "/*.txt")
    sampled_files = sample(sentence_files, num_files)
    print("sampled files:")
    print(sampled_files)

    # Read every line of the sampled files into one list.
    all_lines = []
    for filename in sampled_files:
        with open(filename) as f:
            all_lines.extend(f.read().splitlines())
    print("number of lines sampled:", len(all_lines))

    # Combine into a single file; make sure the target directory exists first.
    tempfile_path = os.path.join("text", "temp.txt")
    os.makedirs(os.path.dirname(tempfile_path), exist_ok=True)
    with open(tempfile_path, "w") as f:
        for sentence in tqdm(all_lines):
            line = sentence.strip()
            # Skip blank lines; the previous check compared a str against a
            # list and therefore never filtered anything.
            if line:
                f.write(line + "\n")
    print("Wrote to ", tempfile_path)
    return tempfile_path
def chunks(sentences, n, tot_len):
    """Generate the ``"text"`` column of consecutive slices of ``sentences``.

    Slice start offsets run from 0 up to (but excluding) ``tot_len`` in steps
    of ``n``; each slice end is clamped to ``len(sentences)``. Every slice is
    indexed with ``["text"]`` before being yielded, so ``sentences`` must
    return something subscriptable by that key when sliced.
    """
    for start in range(0, tot_len, n):
        stop = min(len(sentences), start + n)
        batch = sentences[start:stop]
        yield batch["text"]
def make_sentence_files(dataset, chunksize=5600000, data_dir='text/sentences'):
    """Write the dataset out as sentence-per-line text files.

    The dataset is consumed in chunks of ``chunksize`` records via
    ``chunks``; chunk ``i`` is written to ``<data_dir>/sent_<i>.txt``. Each
    record is stripped, NFKC-normalized and split into sentences with a
    simple regex tokenizer; every sentence found is written on its own line.

    :param dataset: Sliceable collection whose slices expose a ``"text"``
        column (it is passed to ``chunks`` together with ``len(dataset)``).
    :param chunksize: Number of dataset records handled per output file.
    :param data_dir: Directory that receives the ``sent_<i>.txt`` files;
        created if missing.
    """
    # Make sure the output directory exists (no error if it already does).
    os.makedirs(data_dir, exist_ok=True)

    # Simple regex-based sentence tokenizer.
    sent_detector = nltk.RegexpTokenizer(u'[^ !?。]*[!?。.\n]')

    for chunk_ind, sentence_chunk in enumerate(chunks(dataset, chunksize, len(dataset))):
        # One new file per chunk.
        filename = "sent_{}.txt".format(chunk_ind)
        filepath = os.path.join(data_dir, filename)
        print("writing to ", filepath)
        with open(filepath, "w") as f:
            for sentence in tqdm(sentence_chunk):
                line = sentence.strip()
                # normalize() returns a new string; the result has to be
                # kept, otherwise the normalization is silently skipped.
                line = unicodedata.normalize('NFKC', line)
                sentences = sent_detector.tokenize(line)
                # Writing an empty result is a no-op, so no emptiness check
                # is needed.
                f.writelines(s + '\n' for s in sentences)
def combine_files(output_file, *files):
    """
    Combines the contents of multiple text files into a single file.

    Input files are streamed line by line, so memory use does not grow with
    file size. A newline is appended after a file only when its last line
    lacks one, which keeps the files separated without inserting blank lines
    and keeps the returned count equal to the number of lines written.

    :param output_file: Path to the output file (overwritten if it exists).
    :param files: Paths to the files to be combined, in output order.
    :return: Total number of lines in the combined file.
    """
    total_lines = 0
    with open(output_file, 'w') as outfile:
        for path in files:
            last_line = ''
            with open(path, 'r') as infile:
                for last_line in infile:
                    outfile.write(last_line)
                    total_lines += 1
            # Terminate an unterminated final line so the next file starts on
            # a fresh line; empty files contribute nothing.
            if last_line and not last_line.endswith('\n'):
                outfile.write('\n')
    return total_lines
# Make sentence files from the Hugging Face dataset.
# `datasets` was used without ever being imported; import it here.
import datasets

dataset_bio = datasets.load_dataset("Siddharth63/biological_dataset")
make_sentence_files(dataset_bio["train"])

# Combine the first two sentence files into the tokenizer training corpus
# (the original note targets ~45 million sentences). Sorting makes the
# selection reproducible, since glob returns files in arbitrary order.
files_to_combine = sorted(glob.glob("text/sentences/*.txt"))
files_to_combine = files_to_combine[:2]
# This name was previously undefined; the path matches the `input` given to
# the SentencePiece trainer below.
output_file_path = "text/final_file.txt"
total_lines = combine_files(output_file_path, *files_to_combine)
# Train the SentencePiece tokenizer on the combined corpus
# (sampling up to 45 million input sentences).
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="text/final_file.txt",
    model_prefix='spiece',
    vocab_size=32000,
    character_coverage=1.0,
    # Special-token ids: pad=0, eos=1, unk=2; bos_id=-1 leaves BOS unset.
    pad_id=0,
    unk_id=2,
    eos_id=1,
    bos_id=-1,
    # Extra symbols kept as single pieces (presumably the UL2 mode tokens).
    user_defined_symbols=['[NLU]', '[NLG]', '[S2S]'],
    train_extremely_large_corpus=True,
    num_threads=90,
    input_sentence_size=45000000,
    shuffle_input_sentence=True,
)
| # Add 100 extra tokens to the model | |
| from seqio import SentencePieceVocabulary | |
| import os | |
| import tensorflow as tf | |
| from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2 | |
def add_100extra(vocab: SentencePieceVocabulary, out_dir: str):
    """Export a seqio vocabulary as ``spiece.model`` / ``spiece.vocab`` files.

    Writes ``vocab.sp_model`` to ``<out_dir>/spiece.model``, then parses the
    same bytes as a SentencePiece ``ModelProto`` and writes one
    ``piece<TAB>score`` line per piece to ``<out_dir>/spiece.vocab``. Any
    extra ids appear in the output only insofar as ``vocab.sp_model`` already
    contains them; this function does not add pieces itself.

    :param vocab: Vocabulary whose ``sp_model`` attribute holds the
        serialized SentencePiece model.
    :param out_dir: Output directory; created if it does not exist.
    """
    tf.io.gfile.makedirs(out_dir)
    serialized_model = vocab.sp_model

    # Context managers guarantee both files are flushed and closed; the
    # handles were previously left for the garbage collector.
    with tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'wb') as model_file:
        model_file.write(serialized_model)

    model = sentencepiece_model_pb2.ModelProto.FromString(serialized_model)
    with tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w') as vocab_file:
        vocab_file.write(
            '\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces)
        )
| # vocab = t5.data.get_default_vocabulary() | |
| # out_dir = "../vocabulary/cc_all.32000.100extra" | |
| # | |
| # add_100extra(vocab, out_dir) | |
| # | |
| # vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=100) | |
| # out_dir = "../vocabulary/nedd.32000.100extra" | |
| # add_100extra(vocab, out_dir) | |
| # | |
| # vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=128) | |
| # out_dir = "../vocabulary/nedd.32000.128extra" | |
| # add_100extra(vocab, out_dir) | |
| # | |
# Load the trained SentencePiece model with 100 extra ids and export the
# resulting model/vocab files into the "conv" directory.
vocab = SentencePieceVocabulary(
    "/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model",
    extra_ids=100,
)
out_dir = "conv"
add_100extra(vocab, out_dir)