# See also: (reference lost in export — this stray line was not valid Python)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import tensorflow_hub as hub
# Load the wine-reviews data, keeping only the columns we need.
df = pd.read_csv(
    "data/wine-reviews.csv",
    usecols=["country", "description", "points", "price", "variety", "winery"],
)
df.head()  # peek at the imported data (display is a no-op outside a notebook)

# Shuffle the rows, then carve off train (80%) / validation (10%) / test (10%).
shuffled = df.sample(frac=1)
n = len(shuffled)
train, val, test = np.split(shuffled, [int(0.8 * n), int(0.9 * n)])

# Report the size of each split (display is a no-op outside a notebook).
len(train), len(val), len(test)
# A utility method to create a tf.data dataset from a Pandas Dataframe
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
    """Build a batched, prefetching tf.data.Dataset of (description, label) pairs.

    Args:
        dataframe: DataFrame with a text column 'description' and a target
            column 'label'.
            NOTE(review): the CSV as loaded above has no 'label' column —
            presumably it is added upstream (e.g. derived from 'points');
            confirm before running.
        shuffle: whether to shuffle the examples (default True).
        batch_size: examples per batch (default 1024).

    Returns:
        tf.data.Dataset yielding (description batch, label batch).
    """
    # Work on a copy so we never mutate the caller's DataFrame.
    # (Bug fix: the original popped 'label' from `dataframe`, silently
    # deleting the column from the caller's train/val/test frames.)
    df = dataframe.copy()
    labels = df.pop('label')
    # The model consumes only the raw review text.
    features = df["description"]
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # Buffer as large as the data => a full uniform shuffle.
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    # Overlap input preprocessing with model execution.
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds
#convert into tensors:
# Materialise tf.data pipelines for the three splits.
train_data, valid_data, test_data = (
    df_to_dataset(split) for split in (train, val, test)
)
# Pull the first batch to sanity-check the pipeline contents
# (display is a no-op outside a notebook).
list(train_data)[0]
# Pretrained text-embedding layer from TF Hub; per the module name,
# nnlm-en-dim50 maps each English string to a 50-dimensional vector.
# (The export had fused these statements onto one line — invalid syntax.)
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)
# NOTE: loading may print a TF warning about
# tensorflow.python.training.tracking.data_structures having moved to
# tensorflow.python.trackable.data_structures — it comes from TF Hub
# internals, not from this code.
# Display the embedded data: the text has been converted to an array of
# numbers using nnlm-en-dim50 (display is a no-op outside a notebook).
hub_layer(list(train_data)[0][0])
# Binary classifier: hub text embedding -> two Dense+Dropout stages ->
# single sigmoid output (probability of the positive class).
# (The export had fused these statements onto one line — invalid syntax.)
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation="relu"))  # 16 = number of neurons
model.add(tf.keras.layers.Dropout(0.4))                  # reduce overfitting
model.add(tf.keras.layers.Dense(16, activation="relu"))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
# Compile for binary classification, then score the untrained model on the
# validation split as a baseline.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)
model.evaluate(valid_data)
# Now train the model. (Bug fix: in the export the fit call sat on the same
# line as this comment and therefore never ran, leaving `history` undefined
# for the accuracy plots below.)
history = model.fit(train_data, epochs=1, validation_data=valid_data)
# Plot training vs validation accuracy. A validation curve that fails to
# improve indicates overfitting; the Dropout layers above are the mitigation.
# NOTE(review): original comment reported this cell "is killing my kernel".
# (Bug fix: the export had backslash-escaped quotes, e.g. \'accuracy\',
# which is invalid Python outside a string literal.)
plt.plot(history.history['accuracy'], label="Training acc")
plt.plot(history.history['val_accuracy'], label="Validation acc")
plt.title("Accuracy of model")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.legend()
plt.show()

# Final evaluation on the held-out test split; the original run reported
# roughly loss: 0.4943 - accuracy: 0.7801.
model.evaluate(test_data)