# %pip install sentencepiece
# %pip install datasets
# %pip install seqio

import glob
import os
import unicodedata
from random import sample

import datasets
import nltk
from tqdm import tqdm

def sample_and_make_tempfile(sentences_dir, num_files):
    """Use the set of files containing a sentence per line,
    sample num_files out of those and save as a temp file."""

    sentence_files = glob.glob(sentences_dir + "/*.txt")

    # sample num_files
    sampled_files = sample(sentence_files, num_files)

    print("sampled files:")
    print(sampled_files)

    # read all the lines from sampled files and save to a list
    all_lines = []
    for filename in sampled_files:
        with open(filename) as f:
            lines = f.read().splitlines()
        all_lines.extend(lines)

    print("number of lines sampled:", len(all_lines))

    # combine into a single file and save
    tempfile_path = os.path.join("text", "temp.txt")
    with open(tempfile_path, "w") as f:
        for sentence in tqdm(all_lines):
            # strip whitespace and skip empty lines
            line = sentence.strip()
            if line:
                f.write(line + '\n')

    print("Wrote to ", tempfile_path)
    return tempfile_path
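
# Example usage (a sketch; assumes sentence files already exist under
# text/sentences and that a top-level "text" directory exists for temp.txt):
# temp_path = sample_and_make_tempfile("text/sentences", num_files=2)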

def chunks(sentences, n, tot_len):
    """Yield successive n-sized chunks from sentences."""
    for i in range(0, tot_len, n):
        end_i = min(len(sentences), i + n)
        # slicing a Hugging Face Dataset returns a dict of columns,
        # so index the "text" column to get a plain list of strings
        yield sentences[i:end_i]["text"]

def make_sentence_files(dataset, chunksize=5600000, data_dir='text/sentences'):
    """Write the dataset as sentence-per-line files, chunksize sentences per file."""

    # make sure data dir exists
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # use a simple regex for sentence tokenizing
    sent_detector = nltk.RegexpTokenizer(u'[^ !?。]*[!?。.\n]')

    # loop over the chunks
    for chunk_ind, sentence_chunk in enumerate(chunks(dataset, chunksize, len(dataset))):

        # new file for each chunk
        filename = "sent_{}.txt".format(chunk_ind)
        filepath = os.path.join(data_dir, filename)

        print("writing to ", filepath)

        with open(filepath, "w") as f:
            for sentence in tqdm(sentence_chunk):
                # remove surrounding whitespace and newlines
                line = sentence.strip()
                # unicode-normalize Japanese spaces etc.
                line = unicodedata.normalize('NFKC', line)
                # tokenize into sentences
                sentences = sent_detector.tokenize(line)
                # do not save empty items
                if sentences != []:
                    f.writelines(s + '\n' for s in sentences)

def combine_files(output_file, *files):
    """
    Combines the contents of multiple text files into a single file.

    :param output_file: Path to the output file.
    :param files: Paths to the files to be combined.
    :return: Total number of lines in the combined file.
    """
    total_lines = 0

    with open(output_file, 'w') as outfile:
        for file in files:
            with open(file, 'r') as infile:
                lines = infile.readlines()
                total_lines += len(lines)
                outfile.writelines(lines)
                # Add a newline for separation (optional)
                outfile.write('\n')

    return total_lines

# make sentence files from the Hugging Face dataset
dataset_bio = datasets.load_dataset("Siddharth63/biological_dataset")
make_sentence_files(dataset_bio["train"])

# combine files to get 45 million sentences
output_file_path = "text/final_file.txt"  # this path is fed to the SentencePiece trainer below
files_to_combine = glob.glob("text/sentences/*.txt")
files_to_combine = files_to_combine[:2]
total_lines = combine_files(output_file_path, *files_to_combine)

# Train the SentencePiece tokenizer on the 45 million sentences
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="text/final_file.txt", model_prefix='spiece', vocab_size=32000,
    character_coverage=1.0, pad_id=0, unk_id=2, eos_id=1, bos_id=-1,
    user_defined_symbols=['[NLU]', '[NLG]', '[S2S]'],
    train_extremely_large_corpus=True, num_threads=90,
    input_sentence_size=45000000, shuffle_input_sentence=True,
)
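
# Quick sanity check (a sketch, not part of the original pipeline): load the
# freshly trained model and confirm the vocab size and user-defined symbols.
sp = spm.SentencePieceProcessor(model_file="spiece.model")
print(sp.get_piece_size())       # expected: 32000
print(sp.piece_to_id("[NLU]"))   # user-defined symbol gets its own id
print(sp.encode("The protein binds the receptor.", out_type=str))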

# Add 100 extra tokens to the model
from seqio import SentencePieceVocabulary
import os
import tensorflow as tf
from sentencepiece import sentencepiece_model_pb2


def add_100extra(vocab: SentencePieceVocabulary, out_dir: str):
    tf.io.gfile.makedirs(out_dir)
    # vocab.sp_model holds the serialized model proto (bytes), so write it in binary mode
    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'wb').write(vocab.sp_model)

    model = sentencepiece_model_pb2.ModelProto.FromString(vocab.sp_model)
    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w').write(
        '\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces)
    )
# vocab = t5.data.get_default_vocabulary()
# out_dir = "../vocabulary/cc_all.32000.100extra"
#
# add_100extra(vocab, out_dir)
#
# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=100)
# out_dir = "../vocabulary/nedd.32000.100extra"
# add_100extra(vocab, out_dir)
#
# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=128)
# out_dir = "../vocabulary/nedd.32000.128extra"
# add_100extra(vocab, out_dir)
#
vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100)
out_dir = "conv"
add_100extra(vocab, out_dir)
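
# Sanity check (a sketch; not part of the original script): the seqio vocabulary
# should report the base pieces plus the 100 extra ids, while the model written
# to conv/ still serializes only the base pieces.
print(vocab.vocab_size)   # base piece count + 100 extra ids
print(vocab.extra_ids)    # expected: 100
check_sp = spm.SentencePieceProcessor(model_file=os.path.join(out_dir, "spiece.model"))
print(check_sp.get_piece_size())  # base piece count only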