File size: 721 Bytes
b27a404 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Number of hash buckets: each word is mapped to an integer in [0, voc_size).
voc_size = 10000
# Fixed length every encoded document is padded (or truncated) to.
sent_length = 20

# Toy corpus to encode.
corpus = ['example text 1', 'example text 2', 'example text 3']

# Encode each document as a list of hashed word indices.
hashed_docs = []
for text in corpus:
    # Hash every whitespace-separated word exactly once and convert the scalar
    # result to a plain Python int, so the value that is printed is the very
    # same value that is stored (and pad_sequences receives ordinary ints).
    hashed_doc = [
        int(tf.strings.to_hash_bucket_fast(word, voc_size))
        for word in text.split()
    ]
    # Echo the indices, space-separated, on a single line across all documents.
    for bucket in hashed_doc:
        print(bucket, end=' ')
    hashed_docs.append(hashed_doc)

# Pad at the front ('pre') so every row has exactly sent_length entries.
# NOTE(review): with pad_sequences' default pad value the filler is 0, which
# is also a valid hash bucket — confirm that collision is acceptable downstream.
padded_docs = pad_sequences(hashed_docs, padding='pre', maxlen=sent_length)
print(padded_docs)
|