Text Classification using the TextVectorization layer
Multiclass text classification from scratch using the new Keras TextVectorization layer
- Download the BigQuery dataset
- Explore the data
- Prepare data for training
- Vectorize the data
- Build the model
- Train the model
- Evaluate the model
!pip3 install -q tf-nightly
import tensorflow as tf
import numpy as np
from tensorflow.keras import preprocessing
print(tf.__version__)
!gsutil cp gs://tensorflow-blog-rnn/so_posts_4labels_blank_80k.tar.gz .
!tar -xf so_posts_4labels_blank_80k.tar.gz
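The archive extracts into train and test directories, with one sub-directory per class; text_dataset_from_directory will infer the labels from those sub-directory names. A quick, optional check (not part of the original notebook) to see the class folders:
import os
# Each sub-directory of 'train' is one class label.
print(sorted(os.listdir('train')))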
batch_size = 32
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'train', batch_size=batch_size, validation_split=0.2, subset='training', seed=42)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'train', batch_size=batch_size, validation_split=0.2, subset='validation', seed=42)
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'test', batch_size=batch_size)
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(5):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])
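To see which integer label corresponds to which class, you can inspect the class_names attribute that text_dataset_from_directory attaches to the dataset. This is a small optional check, not part of the original notebook:
# Map integer labels back to the class (sub-directory) names.
for i, name in enumerate(raw_train_ds.class_names):
    print(i, '->', name)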
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
max_features = 5000
embedding_dim = 128
sequence_length = 500
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)
# Make a text-only dataset (no labels) and call adapt
text_ds = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)
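After adapt, the layer has built a vocabulary from the training text. If you want to confirm what it learned, you can peek at the most frequent tokens with get_vocabulary (an optional check, not part of the original notebook):
# The first entries are the padding and OOV tokens, followed by the most frequent words.
vocab = vectorize_layer.get_vocabulary()
print(len(vocab), 'tokens in the vocabulary')
print(vocab[:10])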
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
# Vectorize the data.
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)
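If you prefer not to hard-code the buffer size, tf.data can pick one for you. The variant below is an alternative, not part of the original code, and would replace the three prefetch lines above rather than run in addition to them:
# Alternative: build the pipelines with an autotuned prefetch buffer.
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)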
for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])
from tensorflow.keras import layers
# An integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype='int64')
x = layers.Embedding(max_features + 1, embedding_dim)(inputs)
x = layers.Bidirectional(layers.LSTM(128))(x)
predictions = layers.Dense(4, activation='softmax', name='predictions')(x)
model = tf.keras.Model(inputs, predictions)
model.compile(
    loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
epochs = 5
# Fit the model using the training and validation datasets.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)
model.summary()
loss, accuracy = model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)