{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Preparation\n", "After we have seen how our data looks, we need to spend some time preparing our data for use. This means we need to ensure that the data types are correct and that we have dealt with \"dirty\" data (missing fields, outliers, unary values), and possibly create new factors that can enhance predictive models or better explain the data we have." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'pandas'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn [1], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39m# Import relevant libraries\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmatplotlib\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpyplot\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mplt\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mseaborn\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39msns\u001b[39;00m\n", "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pandas'" ] } ], "source": [ "# Import relevant libraries\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import sys\n", "sys.path.append('..')\n", "from src.data import load_data\n", "\n", "# If you prefer a different style, pick from this list\n", "# plt.style.available\n", "pd.set_option('display.precision',4)\n", "plt.style.use('fivethirtyeight')\n", "plt.rcParams[\"figure.figsize\"] = (20, 10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before we jump in, 
we'll use the [cereal](DataDictionary.ipynb#cereal) dataset for some of these examples." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspaces/py4analytics/book/data/Cereals_dirty.csv\n" ] }, { "data": { "text/html": [ "
\n", " | name | \n", "mfr | \n", "type | \n", "calories | \n", "protein | \n", "fat | \n", "sodium | \n", "fiber | \n", "carbo | \n", "sugars | \n", "potass | \n", "vitamins | \n", "shelf | \n", "weight | \n", "cups | \n", "rating | \n", "foodtype | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "100%_Bran | \n", "N | \n", "C | \n", "70 | \n", "4 | \n", "1 | \n", "130 | \n", "10.0 | \n", "5.0 | \n", "6.0 | \n", "280.0 | \n", "25 | \n", "3 | \n", "NaN | \n", "0.33 | \n", "68.4030 | \n", "breakfast | \n", "
1 | \n", "100%_Natural_Bran | \n", "Q | \n", "C | \n", "120 | \n", "3 | \n", "5 | \n", "15 | \n", "2.0 | \n", "8.0 | \n", "8.0 | \n", "135.0 | \n", "0 | \n", "3 | \n", "1.0 | \n", "1.00 | \n", "33.9837 | \n", "breakfast | \n", "
2 | \n", "All-Bran | \n", "K | \n", "C | \n", "70 | \n", "4 | \n", "1 | \n", "260 | \n", "9.0 | \n", "7.0 | \n", "5.0 | \n", "320.0 | \n", "25 | \n", "3 | \n", "1.0 | \n", "0.33 | \n", "59.4255 | \n", "breakfast | \n", "
3 | \n", "All-Bran_with_Extra_Fiber | \n", "K | \n", "C | \n", "50 | \n", "4 | \n", "0 | \n", "140 | \n", "14.0 | \n", "8.0 | \n", "0.0 | \n", "330.0 | \n", "25 | \n", "3 | \n", "1.0 | \n", "0.50 | \n", "93.7049 | \n", "breakfast | \n", "
4 | \n", "Almond_Delight | \n", "R | \n", "C | \n", "110 | \n", "2 | \n", "2 | \n", "200 | \n", "1.0 | \n", "14.0 | \n", "8.0 | \n", "NaN | \n", "25 | \n", "3 | \n", "1.0 | \n", "0.75 | \n", "34.3848 | \n", "breakfast | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
72 | \n", "Triples | \n", "G | \n", "C | \n", "110 | \n", "2 | \n", "1 | \n", "250 | \n", "0.0 | \n", "21.0 | \n", "3.0 | \n", "60.0 | \n", "25 | \n", "3 | \n", "1.0 | \n", "0.75 | \n", "39.1062 | \n", "breakfast | \n", "
73 | \n", "Trix | \n", "G | \n", "C | \n", "110 | \n", "1 | \n", "1 | \n", "140 | \n", "0.0 | \n", "13.0 | \n", "12.0 | \n", "25.0 | \n", "25 | \n", "2 | \n", "1.0 | \n", "1.00 | \n", "27.7533 | \n", "breakfast | \n", "
74 | \n", "Wheat_Chex | \n", "R | \n", "C | \n", "100 | \n", "3 | \n", "1 | \n", "230 | \n", "3.0 | \n", "17.0 | \n", "3.0 | \n", "115.0 | \n", "25 | \n", "1 | \n", "1.0 | \n", "0.67 | \n", "49.7874 | \n", "breakfast | \n", "
75 | \n", "Wheaties | \n", "G | \n", "C | \n", "100 | \n", "3 | \n", "1 | \n", "200 | \n", "3.0 | \n", "17.0 | \n", "3.0 | \n", "110.0 | \n", "25 | \n", "1 | \n", "1.0 | \n", "1.00 | \n", "51.5922 | \n", "breakfast | \n", "
76 | \n", "Wheaties_Honey_Gold | \n", "G | \n", "C | \n", "110 | \n", "2 | \n", "1 | \n", "200 | \n", "1.0 | \n", "16.0 | \n", "8.0 | \n", "60.0 | \n", "25 | \n", "1 | \n", "1.0 | \n", "0.75 | \n", "36.1876 | \n", "breakfast | \n", "
77 rows × 17 columns
\n", "