Text Classification using the TextVectorization layer
Multiclass text classification from scratch using the new Keras TextVectorization layer
- Download the BigQuery dataset
- Explore the data
- Prepare data for training
- Vectorize the data
- Build the model
- Train the model
- Evaluate the model
!pip3 install -q tf-nightly
import tensorflow as tf
import numpy as np
from tensorflow.keras import preprocessing
print(tf.__version__)
!gsutil cp gs://tensorflow-blog-rnn/so_posts_4labels_blank_80k.tar.gz .
!tar -xf so_posts_4labels_blank_80k.tar.gz
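The archive extracts into train and test directories, with one sub-directory per class; text_dataset_from_directory will infer the labels from those sub-directory names. A quick, optional check (not part of the original notebook) to see the class folders:
import os
# Each sub-directory of 'train' is one class label.
print(sorted(os.listdir('train')))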
batch_size = 32
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'train', batch_size=batch_size, validation_split=0.2, subset='training', seed=42)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'train', batch_size=batch_size, validation_split=0.2, subset='validation', seed=42)
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'test', batch_size=batch_size)
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(5):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])
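To see which integer label corresponds to which class, you can inspect the class_names attribute that text_dataset_from_directory attaches to the dataset. This is a small optional check, not part of the original notebook:
# Map integer labels back to the class (sub-directory) names.
for i, name in enumerate(raw_train_ds.class_names):
    print(i, '->', name)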
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
max_features = 5000
embedding_dim = 128
sequence_length = 500
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)
# Make a text-only dataset (no labels) and call adapt
text_ds = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)
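After adapt, the layer has built a vocabulary from the training text. If you want to confirm what it learned, you can peek at the most frequent tokens with get_vocabulary (an optional check, not part of the original notebook):
# The first entries are the padding and OOV tokens, followed by the most frequent words.
vocab = vectorize_layer.get_vocabulary()
print(len(vocab), 'tokens in the vocabulary')
print(vocab[:10])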
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
# Vectorize the data.
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)
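If you prefer not to hard-code the buffer size, tf.data can pick one for you. The variant below is an alternative, not part of the original code, and would replace the three prefetch lines above rather than run in addition to them:
# Alternative: build the pipelines with an autotuned prefetch buffer.
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)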
for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])
from tensorflow.keras import layers
# An integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype='int64')
x = layers.Embedding(max_features + 1, embedding_dim)(inputs)
x = layers.Bidirectional(layers.LSTM(128))(x)
predictions = layers.Dense(4, activation='softmax', name='predictions')(x)
model = tf.keras.Model(inputs, predictions)
model.compile(
    loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
epochs = 5
# Fit the model using the training and validation datasets.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)
model.summary()
loss, accuracy = model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)