Table of Contents

AI - Deep learning on text data with TensorFlow using a Long Short-Term Memory (LSTM) network

see also:

Introduction

Code example

Initial set-up and data input

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
# Load the wine reviews, keeping only the columns used below.
df = pd.read_csv(
    "data/wine-reviews.csv",
    usecols=['country','description','points','price','variety','winery'],
)

df.head()  # display data

# Binary target: 1 if the wine scored 90+ points, else 0.
df["label"] = (df.points >= 90).astype(int)
df = df[["description","label"]]

df.head()  # display data

# Shuffle the rows, then cut into 80% train / 10% validation / 10% test.
eighty_pct = int(0.8 * len(df))
ninety_pct = int(0.9 * len(df))
train, val, test = np.split(df.sample(frac=1), [eighty_pct, ninety_pct])

# sanity-check the split sizes:
len(train), len(val), len(test)

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
  """Create a batched, prefetched tf.data.Dataset from a DataFrame.

  Args:
    dataframe: DataFrame with a 'description' (text) column and a 'label'
      column. The caller's DataFrame is left unmodified.
    shuffle: Whether to shuffle examples; the shuffle buffer spans the
      whole frame, so ordering is fully randomized.
    batch_size: Number of examples per batch.

  Returns:
    A tf.data.Dataset yielding (description, label) batches.
  """
  df = dataframe.copy()
  # Bug fix: pop the label from the *copy*, not from `dataframe` itself --
  # the original popped from the argument, silently deleting the 'label'
  # column from the caller's DataFrame.
  labels = df.pop('label')
  df = df["description"]
  ds = tf.data.Dataset.from_tensor_slices((df, labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds
  
# Wrap each split in a batched, prefetched tf.data pipeline.
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val)
test_data = df_to_dataset(test)

#view data: materialize the batches and show the first (text, label) batch
list(train_data)[0]

Encode the text data

# Learn a vocabulary (capped at 2000 tokens) from the training text only;
# the map() strips the labels so adapt() sees just the description strings.
encoder = tf.keras.layers.TextVectorization(max_tokens=2000)
encoder.adapt(train_data.map(lambda text, label: text))

vocab = np.array(encoder.get_vocabulary())
vocab[:200] #display 1st 200 vocabulary entries

Create the model

# Binary classifier: raw text -> token ids -> embeddings -> LSTM -> dense head.
model = tf.keras.Sequential([
    encoder,  # TextVectorization layer: raw strings to integer token ids
    tf.keras.layers.Embedding(
        input_dim = len(encoder.get_vocabulary()),
        output_dim = 32,
        mask_zero=True  # mask padding (token id 0) for the layers downstream
    ),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),  # regularization against overfitting
    tf.keras.layers.Dense(1, activation='sigmoid')  # P(points >= 90)
        
])

Compile model and train

# Compile with Adam; binary cross-entropy matches the sigmoid output layer,
# which emits probabilities (so from_logits stays at its default, False).
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

# Train for two epochs, scoring the validation split after each one.
history = model.fit(train_data, validation_data=valid_data, epochs=2)

Evaluate with test data

# Final held-out evaluation; returns [loss, accuracy] per the compile metrics.
model.evaluate(test_data)

#should have loss: 0.3567 - accuracy: 0.8375