# %pip install sentencepiece # %pip install datasets # %pip install seqio import unicodedata import os import nltk from tqdm import tqdm import glob from random import sample def sample_and_make_tempfile(sentences_dir, num_files): """ Use the set of files containing a sentence per line, sample num_files out of those and save as a temp file """ sentence_files = glob.glob(sentences_dir + "/*.txt") # sample num_files sampled_files=sample(sentence_files, num_files) print("sampled files:") print(sampled_files) #read all the lines from sampled files and save to a list all_lines = [] for filename in sampled_files: with open(filename) as f: lines = f.read().splitlines() all_lines.extend(lines) print("number of lines sampled:", len(all_lines)) #combine into a single file and save tempfile_path = os.path.join("text", "temp.txt") with open(tempfile_path, "w") as f: for sentence in tqdm(all_lines): # remove newlines line = sentence.strip() # do not save empty items such as if sentence != []: f.writelines(sentence + '\n') print("Wrote to ", tempfile_path) return tempfile_path def chunks(sentences, n, tot_len): """Yield successive n-sized chunks from sentences.""" for i in range(0, tot_len, n): end_i = min(len(sentences),i + n) yield sentences[i:end_i]["text"] def make_sentence_files(dataset, chunksize = 5600000, data_dir = 'text/sentences'): """ Make a sentence per line files, chuncsize sentences per file""" # make sure data dir exists if not os.path.exists(data_dir): os.makedirs(data_dir) # use simple regex for sentence tokenizing sent_detector = nltk.RegexpTokenizer(u'[^ !?。]*[!?。.\n]') # loop over the chunks for chunk_ind, sentence_chunk in enumerate(chunks(dataset, chunksize, len(dataset))): # new file for each chunk filename = "sent_{}.txt".format(chunk_ind) filepath = os.path.join(data_dir, filename) print("writing to ", filepath) with open(filepath, "w") as f: for sentence in tqdm(sentence_chunk): # remove newlines line = sentence.strip() # unicode normalize japanese spaces etc unicodedata.normalize('NFKC', line) # tokenize into sentences sentences = sent_detector.tokenize(line) # do not save empty items such as if sentences != []: f.writelines(s + '\n' for s in sentences) def combine_files(output_file, *files): """ Combines the contents of multiple text files into a single file. :param output_file: Path to the output file. :param files: Paths to the files to be combined. :return: Total number of lines in the combined file. """ total_lines = 0 with open(output_file, 'w') as outfile: for file in files: with open(file, 'r') as infile: lines = infile.readlines() total_lines += len(lines) outfile.writelines(lines) # Add a newline for separation (optional) outfile.write('\n') return total_lines # make sentence files from hugingface dataset dataset_bio = datasets.load_dataset("Siddharth63/biological_dataset") make_sentence_files(dataset_bio["train"]) # combine files to get 45 million sentences files_to_combine = glob.glob("text/sentences/*.txt") files_to_combine = files_to_combine[:2] total_lines = combine_files(output_file_path, *files_to_combine) # Train the sentencepiece transformers on 45 million sentences import sentencepiece as spm spm.SentencePieceTrainer.train(input="text/final_file.txt", model_prefix='spiece', vocab_size=32000, character_coverage=1.0, pad_id=0, unk_id=2, eos_id=1, bos_id=-1, user_defined_symbols=['[NLU]', '[NLG]', '[S2S]'], train_extremely_large_corpus=True, num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True) # Add 100 extra tokens to the model from seqio import SentencePieceVocabulary import os import tensorflow as tf from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2 def add_100extra(vocab: SentencePieceVocabulary, out_dir: str): tf.io.gfile.makedirs(out_dir) tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'w').write(vocab.sp_model) model = sentencepiece_model_pb2.ModelProto.FromString(vocab.sp_model) tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w').write( '\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces) ) # vocab = t5.data.get_default_vocabulary() # out_dir = "../vocabulary/cc_all.32000.100extra" # # add_100extra(vocab, out_dir) # # vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=100) # out_dir = "../vocabulary/nedd.32000.100extra" # add_100extra(vocab, out_dir) # # vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=128) # out_dir = "../vocabulary/nedd.32000.128extra" # add_100extra(vocab, out_dir) # vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100) out_dir = "conv" add_100extra(vocab, out_dir)