# File size: 721 Bytes
#
# b27a404

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Configuration: bucket count handed to the string hash, and the fixed
# length that every hashed sequence is padded to.
voc_size = 10000
sent_length = 20

# Toy corpus -- one string per document.
corpus = [
    'example text 1',
    'example text 2',
    'example text 3',
]

# Generate hashed integer sequences for the corpus.
#
# Each document is split on whitespace and every token is mapped to a bucket
# index via tf.strings.to_hash_bucket_fast with voc_size buckets.  The whole
# token list is hashed in a single op call (instead of one call per word, and
# a second call per word just for printing), and the result is stored as plain
# Python ints rather than a list of 0-d tensors.
#
# NOTE(review): bucket indices presumably start at 0, which is also the
# default fill value of pad_sequences -- a token hashing to bucket 0 would be
# indistinguishable from padding.  Confirm whether that matters downstream.
hashed_docs = []
for text in corpus:
    words = text.split()
    if words:
        hashed_doc = (
            tf.strings.to_hash_bucket_fast(words, voc_size).numpy().tolist()
        )
    else:
        # Skip the op entirely for a document with no tokens.
        hashed_doc = []
    # Debug output: one line of space-separated bucket ids per document, so
    # documents stay distinguishable and later prints start on a fresh line.
    print(*hashed_doc)
    hashed_docs.append(hashed_doc)

# Bring every hashed sequence to sent_length entries, padding at the
# front ('pre'), then show the resulting array.
padded_docs = pad_sequences(
    hashed_docs,
    maxlen=sent_length,
    padding='pre',
)

print(padded_docs)