import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hashing-trick vocabulary size and the fixed (padded) sentence length
voc_size = 10000
sent_length = 20

# A toy corpus of three short documents
corpus = ['example text 1', 'example text 2', 'example text 3']

# Hash each word into one of voc_size buckets (the "hashing trick"), so no
# explicit word-to-index vocabulary has to be built or stored
hashed_docs = []
for text in corpus:
    # int() turns each scalar tf.Tensor into a plain Python int, so that
    # pad_sequences below receives ordinary integer lists; this also avoids
    # re-hashing every word a second time just to print it
    hashed_doc = [int(tf.strings.to_hash_bucket_fast(word, voc_size))
                  for word in text.split()]
    print(*hashed_doc)  # show the hashed IDs for this document
    hashed_docs.append(hashed_doc)
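
# Caveat (a general property of the hashing trick, not specific to this
# corpus): distinct words can collide in the same bucket, and a word hashing
# to 0 would be indistinguishable from the zero padding added below. A common
# hedge (a sketch, if reserving index 0 is desired) shifts IDs up by one:
#   ids = [int(tf.strings.to_hash_bucket_fast(w, voc_size - 1)) + 1
#          for w in text.split()]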

# Pre-pad every document with zeros so all rows share length sent_length
padded_docs = pad_sequences(hashed_docs, padding='pre', maxlen=sent_length)

print(padded_docs)
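# Sanity check: pad_sequences returns a NumPy array of shape
# (num_documents, sent_length); with padding='pre', each row's hashed IDs
# sit at the right end of the row behind the leading zeros
print(padded_docs.shape)  # (3, 20) for the three-document corpus above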
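
# Downstream sketch (an assumption, not part of the original listing): padded
# hash IDs are the usual input to a Keras Embedding layer, with voc_size as
# its input dimension so every hash bucket gets its own vector
embedding_dim = 8  # hypothetical embedding size, chosen only for illustration
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(voc_size, embedding_dim),
])
print(model.predict(padded_docs).shape)  # (3, 20, 8): one vector per token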