# Text-statistics utilities: count tokens, words, sentences, and characters in a text.
import os

# Import nltk library for natural language processing
import nltk
from transformers import AutoTokenizer
def load_nltk():
    """Ensure the NLTK 'punkt' tokenizer data is available, downloading it if missing.

    Uses nltk.data.find() so every directory on NLTK's search path is checked,
    instead of one hardcoded '/home/user/...' location that only works for a
    single user account.
    """
    try:
        # Raises LookupError when the punkt tokenizer data is not installed.
        path = nltk.data.find('tokenizers/punkt')
        print('nltk punkt file exists in ', path)
    except LookupError:
        print("downloading punkt file")
        nltk.download('punkt')
# Define a function that takes some text as input and returns the number of tokens
def token_count(text):
    """Train a BPE encoder on *text* and return the number of tokens it produces.

    NOTE(review): training a fresh encoder per call is expensive; the huge
    vocab_size effectively makes this whitespace-level BPE — confirm intent.
    """
    # Import the Encoder class from bpe (local import keeps the third-party
    # dependency optional for callers that never use this function)
    from bpe import Encoder
    # Create an encoder object (vocab size as originally configured)
    encoder = Encoder(vocab_size=14735746)
    # Train the encoder on the whitespace-split words of the text
    encoder.fit(text.split())
    # Encode the text into tokens
    tokens = encoder.tokenize(text)
    # Bug fix: the function promises a count but previously returned the
    # token list itself; return the number of tokens as documented.
    return len(tokens)
def num_tokens(text):
    """Return the number of GPT-2 tokens in *text*.

    The pretrained tokenizer is loaded once and cached on the function
    object — the original reloaded it from disk on every call.
    """
    tokenizer = getattr(num_tokens, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        num_tokens._tokenizer = tokenizer
    token_ids = tokenizer.encode(text)
    return len(token_ids)
def num_words(text):
    """Return the number of word tokens NLTK finds in *text*.

    Splits the text into sentences first, then word-tokenizes each
    sentence, matching NLTK's recommended two-stage tokenization.
    """
    # Tokenize each sentence into words using nltk.word_tokenize()
    tokens = [
        token
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
    ]
    return len(tokens)
def num_sentences(text):
    """Return the number of sentences NLTK detects in *text*."""
    # Tokenize the text into sentences using nltk.sent_tokenize()
    return len(nltk.sent_tokenize(text))
def num_chars(text):
    """Return the number of characters in *text*."""
    return len(text)
# Example usage — print out the results (note: the statistics functions must
# be CALLED; the originals interpolated the function objects themselves):
# text = "Some sample text."
# print(f"Number of sentences: {num_sentences(text)}")
# print(f"Number of words: {num_words(text)}")
# print(f"Number of tokens: {num_tokens(text)}")
# print(f"Number of BPE tokens: {token_count(text)}")
# print(f"Number of characters: {num_chars(text)}")