{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "IST718_FinalProject_Colab_Keras_Categorical.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "RIKbaZtyhAiO", "colab_type": "text" }, "source": [ "# STEP 1: Import ALL the things\n", "[link to tutorial](https://www.tensorflow.org/tutorials/structured_data/feature_columns)" ] }, { "cell_type": "code", "metadata": { "id": "sTc2g5uFgoF3", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "outputId": "7a3f0888-eb93-4408-f0a0-1a5f7e12221c" }, "source": [ "%tensorflow_version 2.x\n", "from __future__ import absolute_import, division, print_function, unicode_literals\n", "import numpy as np\n", "import pandas as pd\n", "import tensorflow as tf\n", "from tensorflow import feature_column\n", "from tensorflow.keras import layers\n", "from sklearn.model_selection import train_test_split" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "TensorFlow 2.x selected.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "W1tOvP0mgtmp", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 191 }, "outputId": "9ea282c4-1d3f-46d8-c46b-c5dafd48b703" }, "source": [ "url = 'https://storage.googleapis.com/applied-dl/heart.csv'\n", "df = pd.read_csv(url)\n", "df.head()" ], "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathaltarget
063111452331215002.330fixed0
167141602860210811.523normal1
267141202290212912.622reversible0
337131302500018703.530normal0
441021302040217201.410normal0
\n", "
" ], "text/plain": [ " age sex cp trestbps chol ... oldpeak slope ca thal target\n", "0 63 1 1 145 233 ... 2.3 3 0 fixed 0\n", "1 67 1 4 160 286 ... 1.5 2 3 normal 1\n", "2 67 1 4 120 229 ... 2.6 2 2 reversible 0\n", "3 37 1 3 130 250 ... 3.5 3 0 normal 0\n", "4 41 0 2 130 204 ... 1.4 1 0 normal 0\n", "\n", "[5 rows x 14 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 3 } ] }, { "cell_type": "markdown", "metadata": { "id": "QU0C9tHNhWV-", "colab_type": "text" }, "source": [ "# STEP 2: Split into test and train (and validation)" ] }, { "cell_type": "code", "metadata": { "id": "cEZRTp4ng2c_", "colab_type": "code", "colab": {} }, "source": [ "# Fixed seed so the train/val/test partition is identical on every run.\n", "SEED = 42\n", "\n", "# Hold out 20% of the rows for test, then 20% of the remainder for validation.\n", "train, test = train_test_split(df, test_size=0.2, random_state=SEED)\n", "train, val = train_test_split(train, test_size=0.2, random_state=SEED)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Ns7Ib2x0hdkR", "colab_type": "text" }, "source": [ "# STEP 3: Create an INPUT PIPELINE using `tf.data`" ] }, { "cell_type": "code", "metadata": { "id": "hDAYPP5ChcCZ", "colab_type": "code", "colab": {} }, "source": [ "# A utility method to create a tf.data dataset from a Pandas Dataframe\n", "def df_to_dataset(dataframe, shuffle=True, batch_size=32):\n", "    \"\"\"Convert a DataFrame with a 'target' column into a batched tf.data.Dataset.\n", "\n", "    Args:\n", "        dataframe: pandas DataFrame holding the feature columns plus 'target'.\n", "        shuffle: when True, shuffle with a buffer as large as the frame.\n", "        batch_size: number of rows per emitted batch.\n", "\n", "    Returns:\n", "        A tf.data.Dataset yielding (features_dict, labels) batches.\n", "    \"\"\"\n", "    dataframe = dataframe.copy()  # pop() below mutates, so work on a copy\n", "    labels = dataframe.pop('target')\n", "    # dict(dataframe) maps each column name to its values; slicing the\n", "    # (features, labels) tuple yields one ({column: value}, label) pair per row.\n", "    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))\n", "    if shuffle:\n", "        ds = ds.shuffle(buffer_size=len(dataframe))\n", "    ds = ds.batch(batch_size)\n", "    return ds\n", "\n", "batch_size = 5 # A small batch size is used for demonstration purposes\n", "train_ds = df_to_dataset(train, batch_size=batch_size)\n", "val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)\n", "test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "4EIcI7IEhU7T", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "outputId": 
"51c5d695-bb45-4895-9f24-223b5cd9cc80" }, "source": [ "for feature_batch, label_batch in train_ds.take(1):\n", " print('Every feature:', list(feature_batch.keys()))\n", " print('A batch of ages:', feature_batch['age'])\n", " print('A batch of targets:', label_batch )" ], "execution_count": 6, "outputs": [ { "output_type": "stream", "text": [ "Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']\n", "A batch of ages: tf.Tensor([65 68 51 52 54], shape=(5,), dtype=int32)\n", "A batch of targets: tf.Tensor([0 0 1 0 1], shape=(5,), dtype=int32)\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "YFtV-V34lGXP", "colab_type": "text" }, "source": [ "# STEP 4: Create and Pick Feature Columns" ] }, { "cell_type": "code", "metadata": { "id": "FTFcxZ7yhoL6", "colab_type": "code", "colab": {} }, "source": [ "# We will use this batch to demonstrate several types of feature columns\n", "# A utility method to create a feature column\n", "# and to transform a batch of data\n", "\n", "example_batch = next(iter(train_ds))[0]\n", "\n", "def demo(feature_column):\n", " feature_layer = layers.DenseFeatures(feature_column)\n", " print(feature_layer(example_batch).numpy())" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "bOqD2u_nlehC", "colab_type": "text" }, "source": [ "### NUMERIC COLUMNS" ] }, { "cell_type": "code", "metadata": { "id": "u19XL8Q0iP-6", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 230 }, "outputId": "2ced923a-8186-4a78-fbf9-068d21e04f6c" }, "source": [ "age = feature_column.numeric_column(\"age\")\n", "demo(age)" ], "execution_count": 11, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:Layer dense_features is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. 
The layer has dtype float32 because it's dtype defaults to floatx.\n", "\n", "If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.\n", "\n", "To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.\n", "\n", "[[66.]\n", " [67.]\n", " [53.]\n", " [48.]\n", " [48.]]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "jc5r-WKPlkBD", "colab_type": "text" }, "source": [ "### BUCKETIZED COLUMNS" ] }, { "cell_type": "code", "metadata": { "id": "6_EuEhBFiiW2", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 230 }, "outputId": "050baa9f-b7f9-4c85-831a-5a5648001137" }, "source": [ "age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])\n", "demo(age_buckets)" ], "execution_count": 12, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:Layer dense_features_1 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx.\n", "\n", "If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.\n", "\n", "To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.\n", "\n", "[[0. 0. 0. 0. 0. 0. 0. 0. 
0. 0. 1.]\n", " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]\n", " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "u5RhLs20lnUl", "colab_type": "text" }, "source": [ "### CATEGORICAL COLUMNS" ] }, { "cell_type": "code", "metadata": { "id": "ZEoaS44Zji9s", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 335 }, "outputId": "cb5adda8-6f4f-4a63-85ad-25e74bf20307" }, "source": [ "thal = feature_column.categorical_column_with_vocabulary_list(\n", " 'thal', ['fixed', 'normal', 'reversible'])\n", "\n", "thal_one_hot = feature_column.indicator_column(thal)\n", "demo(thal_one_hot)" ], "execution_count": 13, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:Layer dense_features_2 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx.\n", "\n", "If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.\n", "\n", "To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.\n", "\n", "WARNING:tensorflow:From /tensorflow-2.1.0/python3.6/tensorflow_core/python/feature_column/feature_column_v2.py:4267: IndicatorColumn._variable_shape (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "The old _FeatureColumn APIs are being deprecated. 
Please use the new FeatureColumn APIs instead.\n", "WARNING:tensorflow:From /tensorflow-2.1.0/python3.6/tensorflow_core/python/feature_column/feature_column_v2.py:4322: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.\n", "[[1. 0. 0.]\n", " [0. 1. 0.]\n", " [0. 1. 0.]\n", " [0. 0. 1.]\n", " [0. 0. 1.]]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "lo2NnWW8l9UW", "colab_type": "text" }, "source": [ "### EMBEDDING COLUMNS (best for columns with many values)" ] }, { "cell_type": "code", "metadata": { "id": "9yfyh65ImGQy", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 317 }, "outputId": "66c21701-4c43-43f0-b438-dd9865e4cfea" }, "source": [ "thal_embedding = feature_column.embedding_column(thal, dimension=8)\n", "demo(thal_embedding)" ], "execution_count": 15, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:Layer dense_features_3 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx.\n", "\n", "If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.\n", "\n", "To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. 
If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.\n", "\n", "[[ 0.02936197 0.31758815 -0.3563057 0.31574684 -0.421228 0.10793252\n", " -0.13682729 0.2326817 ]\n", " [ 0.3237073 -0.4747037 0.04366031 0.0928458 0.27537555 -0.5144811\n", " 0.07425166 0.20759253]\n", " [ 0.3237073 -0.4747037 0.04366031 0.0928458 0.27537555 -0.5144811\n", " 0.07425166 0.20759253]\n", " [ 0.00689469 0.01643167 0.03457681 0.10944679 0.09321592 0.14874297\n", " -0.09204395 -0.69555455]\n", " [ 0.00689469 0.01643167 0.03457681 0.10944679 0.09321592 0.14874297\n", " -0.09204395 -0.69555455]]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "q6BkjyWYmZsV", "colab_type": "text" }, "source": [ "### HASHED FEATURE COLUMNS" ] }, { "cell_type": "code", "metadata": { "id": "_n4ESZPhmcKi", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "outputId": "d463cb3e-ff28-44bb-9ea2-4afd8ab46871" }, "source": [ "thal_hashed = feature_column.categorical_column_with_hash_bucket(\n", " 'thal', hash_bucket_size=1000)\n", "demo(feature_column.indicator_column(thal_hashed))" ], "execution_count": 16, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:Layer dense_features_4 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx.\n", "\n", "If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.\n", "\n", "To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. 
If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.\n", "\n", "WARNING:tensorflow:From /tensorflow-2.1.0/python3.6/tensorflow_core/python/feature_column/feature_column_v2.py:4322: HashedCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.\n", "[[0. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "hmERLs6Xmd93", "colab_type": "text" }, "source": [ "### CROSSED FEATURE COLUMNS" ] }, { "cell_type": "code", "metadata": { "id": "jh27uUy0mgN_", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "outputId": "74bf1d46-907c-4b05-d675-5a49560c3973" }, "source": [ "crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)\n", "demo(feature_column.indicator_column(crossed_feature))" ], "execution_count": 17, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:Layer dense_features_5 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx.\n", "\n", "If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.\n", "\n", "To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. 
If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.\n", "\n", "WARNING:tensorflow:From /tensorflow-2.1.0/python3.6/tensorflow_core/python/feature_column/feature_column_v2.py:4322: CrossedColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.\n", "[[0. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]\n", " [0. 0. 0. ... 0. 0. 0.]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "4bJTAmxDjuAu", "colab_type": "code", "colab": {} }, "source": [ "feature_columns = []\n", "\n", "# numeric cols\n", "for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:\n", "    feature_columns.append(feature_column.numeric_column(header))\n", "\n", "# bucketized cols\n", "# Define the source numeric column here so this cell does not rely on the\n", "# `age` variable created in the earlier NUMERIC COLUMNS demo cell.\n", "age = feature_column.numeric_column('age')\n", "age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])\n", "feature_columns.append(age_buckets)\n", "\n", "# indicator cols\n", "thal = feature_column.categorical_column_with_vocabulary_list(\n", "    'thal', ['fixed', 'normal', 'reversible'])\n", "thal_one_hot = feature_column.indicator_column(thal)\n", "feature_columns.append(thal_one_hot)\n", "\n", "# embedding cols\n", "thal_embedding = feature_column.embedding_column(thal, dimension=8)\n", "feature_columns.append(thal_embedding)\n", "\n", "# crossed cols\n", "crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)\n", "crossed_feature = feature_column.indicator_column(crossed_feature)\n", "feature_columns.append(crossed_feature)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "fEyM-FfCmvrf", "colab_type": "text" }, "source": [ "# STEP 5: Create a feature 
LAYER\n", "\n", "After we've defined our feature columns, we will use a `DenseFeatures` layer to input them into our Keras model" ] }, { "cell_type": "code", "metadata": { "id": "2QUKJhYTk8OB", "colab_type": "code", "colab": {} }, "source": [ "feature_layer = tf.keras.layers.DenseFeatures(feature_columns)\n", "batch_size = 32\n", "train_ds = df_to_dataset(train, batch_size=batch_size)\n", "val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)\n", "test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "2uvzMxPhnBB2", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 227 }, "outputId": "46c5802b-1a3d-4722-b730-25948d052654" }, "source": [ "model = tf.keras.Sequential([\n", "    feature_layer,\n", "    layers.Dense(128, activation='relu'),\n", "    layers.Dense(128, activation='relu'),\n", "    layers.Dense(1)\n", "])\n", "\n", "# Dense(1) emits raw logits (the loss is built with from_logits=True), so the\n", "# accuracy metric must threshold at logit 0.0 (probability 0.5). The plain\n", "# 'accuracy' string uses the default 0.5 threshold, which is wrong for logits.\n", "model.compile(optimizer='adam',\n", "              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),\n", "              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])\n", "\n", "model.fit(train_ds,\n", "          validation_data=val_ds,\n", "          epochs=5)" ], "execution_count": 23, "outputs": [ { "output_type": "stream", "text": [ "Train for 7 steps, validate for 2 steps\n", "Epoch 1/5\n", "7/7 [==============================] - 1s 151ms/step - loss: 1.2195 - accuracy: 0.6062 - val_loss: 0.6267 - val_accuracy: 0.7347\n", "Epoch 2/5\n", "7/7 [==============================] - 0s 9ms/step - loss: 0.5732 - accuracy: 0.6788 - val_loss: 0.5630 - val_accuracy: 0.7143\n", "Epoch 3/5\n", "7/7 [==============================] - 0s 8ms/step - loss: 0.6204 - accuracy: 0.7150 - val_loss: 0.5220 - val_accuracy: 0.7551\n", "Epoch 4/5\n", "7/7 [==============================] - 0s 10ms/step - loss: 0.5884 - accuracy: 0.6995 - val_loss: 0.5647 - val_accuracy: 0.7143\n", "Epoch 5/5\n", "7/7 [==============================] - 0s 10ms/step - loss: 0.5835 - accuracy: 0.6943 - val_loss: 
0.5326 - val_accuracy: 0.7347\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 23 } ] }, { "cell_type": "code", "metadata": { "id": "lTv6yijpnD75", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 52 }, "outputId": "7604b025-e393-4db5-d86c-fc3fea370497" }, "source": [ "loss, accuracy = model.evaluate(test_ds)\n", "print(\"Accuracy\", accuracy)" ], "execution_count": 24, "outputs": [ { "output_type": "stream", "text": [ "2/2 [==============================] - 0s 4ms/step - loss: 0.5216 - accuracy: 0.8197\n", "Accuracy 0.8196721\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "XPiqjlZAnGQi", "colab_type": "code", "colab": {} }, "source": [ "" ], "execution_count": 0, "outputs": [] } ] }