# Import nltk for natural language processing and transformers for subword tokenization
import nltk
import os
from transformers import AutoTokenizer


def load_nltk():
    # Download the nltk punkt tokenizer data if it is not already present
    nltk_file = '/home/user/nltk_data/tokenizers/punkt.zip'
    if os.path.exists(nltk_file):
        print('nltk punkt file exists at', nltk_file)
    else:
        print("downloading punkt file")
        nltk.download('punkt')


# Define a function that takes some text as input and returns the number of BPE tokens
def token_count(text):
    # Import the Encoder class from the bpe package
    from bpe import Encoder
    # Create an encoder object with a large vocabulary size
    encoder = Encoder(vocab_size=14735746)
    # Train the encoder on the words of the text
    encoder.fit(text.split())
    # Encode the text into tokens
    tokens = encoder.tokenize(text)
    # Return the number of tokens
    return len(tokens)


def num_tokens(text):
    # Count subword tokens using the GPT-2 tokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def num_words(text):
    # Tokenize the text into sentences, then each sentence into words
    # using nltk.word_tokenize()
    sentences = nltk.sent_tokenize(text)
    words = []
    for sentence in sentences:
        words.extend(nltk.word_tokenize(sentence))
    return len(words)


def num_sentences(text):
    # Tokenize the text into sentences using nltk.sent_tokenize()
    sentences = nltk.sent_tokenize(text)
    return len(sentences)


def num_chars(text):
    # Count raw characters, including whitespace and punctuation
    return len(text)


# Example of printing the results for some input `text`:
# print(f"Number of sentences: {num_sentences(text)}")
# print(f"Number of words: {num_words(text)}")
# print(f"Number of tokens: {num_tokens(text)}")       # GPT-2 tokenizer
# print(f"Number of BPE tokens: {token_count(text)}")  # bpe encoder
# print(f"Number of characters: {num_chars(text)}")
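

# A minimal, runnable sketch of how these helpers might be used together.
# This block is an illustration, not part of the original script: the
# `sample` text is hypothetical, and load_nltk() is called first so the
# punkt data is available before the nltk-based counters run. token_count()
# is left out of the demo because it depends on the third-party bpe package.
if __name__ == "__main__":
    load_nltk()
    sample = "Tokenizers split text differently. Character, word, and subword counts therefore disagree."
    print(f"Number of sentences: {num_sentences(sample)}")
    print(f"Number of words: {num_words(sample)}")
    print(f"Number of GPT-2 tokens: {num_tokens(sample)}")
    print(f"Number of characters: {num_chars(sample)}")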