# see also:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
# Load the wine-reviews dataset, keeping only the columns used downstream.
df = pd.read_csv(
    "data/wine-reviews.csv",
    usecols=['country', 'description', 'points', 'price', 'variety', 'winery'],
)
df.head()  # display data

# Binary target: 1 when a review scores 90 points or more, else 0.
df["label"] = (df.points >= 90).astype(int)
df = df[["description", "label"]]
df.head()  # display data

# Shuffle once, then cut 80/10/10 into train / validation / test.
shuffled = df.sample(frac=1)
train, val, test = np.split(shuffled, [int(0.8 * len(df)), int(0.9 * len(df))])
# check splits:
len(train), len(val), len(test)
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
    """Build a batched, prefetched tf.data.Dataset of (description, label) pairs.

    Args:
        dataframe: DataFrame containing 'description' and 'label' columns.
        shuffle: Whether to shuffle with a full-dataset buffer.
        batch_size: Number of examples per batch.

    Returns:
        A tf.data.Dataset yielding (text batch, label batch) tuples.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    # BUG FIX: the original popped 'label' from `dataframe` itself, which
    # destructively removed the column from the caller's train/val/test frames.
    df = dataframe.copy()
    labels = df.pop('label')
    texts = df["description"]
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        # Buffer covering the whole frame gives a full uniform shuffle.
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds
# Materialize the three pipeline datasets.
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val)
test_data = df_to_dataset(test)
# view data: peek at the first batch only.
# (The original `list(train_data)[0]` iterated and buffered EVERY batch
# just to display the first one.)
next(iter(train_data))
# Text vectorization: learn a 2000-token vocabulary from the training text.
# (The original jammed these four statements onto one line with no separators,
# which is a SyntaxError in Python.)
encoder = tf.keras.layers.TextVectorization(max_tokens=2000)
# Adapt on text only — strip the label from each (text, label) pair.
encoder.adapt(train_data.map(lambda text, label: text))
vocab = np.array(encoder.get_vocabulary())
vocab[:200]  # display 1st 200
# LSTM text classifier: raw strings -> token ids -> embeddings -> sigmoid score.
model = tf.keras.Sequential()
model.add(encoder)
model.add(
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=32,
        mask_zero=True,  # let the LSTM skip padding tokens
    )
)
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# Binary cross-entropy with Adam; track accuracy for reporting.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
history = model.fit(train_data, epochs=2, validation_data=valid_data)
model.evaluate(test_data)  # should have loss: 0.3567 - accuracy: 0.8375