Created
May 5, 2024 12:02
-
-
Save AndreFCruz/97fd5aa5aff4ddd610572eeadc548fa2 to your computer and use it in GitHub Desktop.
Investigating a NaN bug on folktables
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "aa986947-ff09-4041-b727-6abb9fc29c99", | |
| "metadata": {}, | |
| "source": [ | |
| "## Preprocessing" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "18f8a0c4-0187-4947-bbb6-8f9b3a759cc7", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import sys\n", | |
| "import logging\n", | |
| "from pathlib import Path\n", | |
| "\n", | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "from folktables import ACSDataSource" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "9867c806-3527-42e8-a8a1-42148dfa00ea", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Keep the ACS files under ~/data/folktables, creating the folder if it is missing.\n", | |
| "root_dir = Path.home()\n", | |
| "data_dir = root_dir.joinpath(\"data\", \"folktables\")\n", | |
| "data_dir.mkdir(parents=True, exist_ok=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "49cc60f1-eb09-4524-9b9e-6faf62b5560a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Configure the 2018 1-Year person-level ACS source rooted at data_dir.\n", | |
| "# Nothing is loaded here; the data is fetched by the get_data() call in the next cell.\n", | |
| "from folktables.load_acs import state_list\n", | |
| "\n", | |
| "data_source = ACSDataSource(\n", | |
| "    survey_year='2018',\n", | |
| "    horizon='1-Year',\n", | |
| "    survey='person',\n", | |
| "    root_dir=str(data_dir),\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "a2095dc3-47b0-481e-990b-ea0303ce877c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(3236107, 286)" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Load the person records for every entry in state_list (3236107 rows x 286 columns).\n", | |
| "# NOTE(review): download=True is presumably only required until the files exist locally -- confirm.\n", | |
| "acs_data = data_source.get_data(\n", | |
| "    states=state_list,\n", | |
| "    download=True,\n", | |
| ")\n", | |
| "acs_data.shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "facbd077-89f6-46d5-b8ac-5404da0e7571", | |
| "metadata": {}, | |
| "source": [ | |
| "---\n", | |
| "## Search for NaNs" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "ec607408-29e3-4696-bf13-48fc1a017acf", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>RT</th>\n", | |
| " <th>SERIALNO</th>\n", | |
| " <th>DIVISION</th>\n", | |
| " <th>SPORDER</th>\n", | |
| " <th>PUMA</th>\n", | |
| " <th>REGION</th>\n", | |
| " <th>ST</th>\n", | |
| " <th>ADJINC</th>\n", | |
| " <th>PWGTP</th>\n", | |
| " <th>AGEP</th>\n", | |
| " <th>...</th>\n", | |
| " <th>PWGTP71</th>\n", | |
| " <th>PWGTP72</th>\n", | |
| " <th>PWGTP73</th>\n", | |
| " <th>PWGTP74</th>\n", | |
| " <th>PWGTP75</th>\n", | |
| " <th>PWGTP76</th>\n", | |
| " <th>PWGTP77</th>\n", | |
| " <th>PWGTP78</th>\n", | |
| " <th>PWGTP79</th>\n", | |
| " <th>PWGTP80</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>P</td>\n", | |
| " <td>2018GQ0000049</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1600</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1013097</td>\n", | |
| " <td>75</td>\n", | |
| " <td>19</td>\n", | |
| " <td>...</td>\n", | |
| " <td>140</td>\n", | |
| " <td>74</td>\n", | |
| " <td>73</td>\n", | |
| " <td>7</td>\n", | |
| " <td>76</td>\n", | |
| " <td>75</td>\n", | |
| " <td>80</td>\n", | |
| " <td>74</td>\n", | |
| " <td>7</td>\n", | |
| " <td>72</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>P</td>\n", | |
| " <td>2018GQ0000058</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1900</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1013097</td>\n", | |
| " <td>75</td>\n", | |
| " <td>18</td>\n", | |
| " <td>...</td>\n", | |
| " <td>76</td>\n", | |
| " <td>78</td>\n", | |
| " <td>7</td>\n", | |
| " <td>76</td>\n", | |
| " <td>80</td>\n", | |
| " <td>78</td>\n", | |
| " <td>7</td>\n", | |
| " <td>147</td>\n", | |
| " <td>150</td>\n", | |
| " <td>75</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>P</td>\n", | |
| " <td>2018GQ0000219</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " <td>2000</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1013097</td>\n", | |
| " <td>118</td>\n", | |
| " <td>53</td>\n", | |
| " <td>...</td>\n", | |
| " <td>117</td>\n", | |
| " <td>121</td>\n", | |
| " <td>123</td>\n", | |
| " <td>205</td>\n", | |
| " <td>208</td>\n", | |
| " <td>218</td>\n", | |
| " <td>120</td>\n", | |
| " <td>19</td>\n", | |
| " <td>123</td>\n", | |
| " <td>18</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>P</td>\n", | |
| " <td>2018GQ0000246</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " <td>2400</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1013097</td>\n", | |
| " <td>43</td>\n", | |
| " <td>28</td>\n", | |
| " <td>...</td>\n", | |
| " <td>43</td>\n", | |
| " <td>76</td>\n", | |
| " <td>79</td>\n", | |
| " <td>77</td>\n", | |
| " <td>80</td>\n", | |
| " <td>44</td>\n", | |
| " <td>46</td>\n", | |
| " <td>82</td>\n", | |
| " <td>81</td>\n", | |
| " <td>8</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>P</td>\n", | |
| " <td>2018GQ0000251</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " <td>2701</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1013097</td>\n", | |
| " <td>16</td>\n", | |
| " <td>25</td>\n", | |
| " <td>...</td>\n", | |
| " <td>4</td>\n", | |
| " <td>2</td>\n", | |
| " <td>29</td>\n", | |
| " <td>17</td>\n", | |
| " <td>15</td>\n", | |
| " <td>28</td>\n", | |
| " <td>17</td>\n", | |
| " <td>30</td>\n", | |
| " <td>15</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>5 rows × 286 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " RT SERIALNO DIVISION SPORDER PUMA REGION ST ADJINC PWGTP \\\n", | |
| "0 P 2018GQ0000049 6 1 1600 3 1 1013097 75 \n", | |
| "1 P 2018GQ0000058 6 1 1900 3 1 1013097 75 \n", | |
| "2 P 2018GQ0000219 6 1 2000 3 1 1013097 118 \n", | |
| "3 P 2018GQ0000246 6 1 2400 3 1 1013097 43 \n", | |
| "4 P 2018GQ0000251 6 1 2701 3 1 1013097 16 \n", | |
| "\n", | |
| " AGEP ... PWGTP71 PWGTP72 PWGTP73 PWGTP74 PWGTP75 PWGTP76 PWGTP77 \\\n", | |
| "0 19 ... 140 74 73 7 76 75 80 \n", | |
| "1 18 ... 76 78 7 76 80 78 7 \n", | |
| "2 53 ... 117 121 123 205 208 218 120 \n", | |
| "3 28 ... 43 76 79 77 80 44 46 \n", | |
| "4 25 ... 4 2 29 17 15 28 17 \n", | |
| "\n", | |
| " PWGTP78 PWGTP79 PWGTP80 \n", | |
| "0 74 7 72 \n", | |
| "1 147 150 75 \n", | |
| "2 19 123 18 \n", | |
| "3 82 81 8 \n", | |
| "4 30 15 1 \n", | |
| "\n", | |
| "[5 rows x 286 columns]" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Preview the first five rows to see the raw column layout.\n", | |
| "acs_data.head(n=5)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "8c7b28fb-3aba-4979-b4b7-a8dafc11f626", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['RT', 'SERIALNO', 'NAICSP', 'SOCP']" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Classify every column once by dtype, then split the column names with that mask.\n", | |
| "numeric_mask = [np.issubdtype(col_dtype, np.number) for col_dtype in acs_data.dtypes]\n", | |
| "\n", | |
| "numeric_cols = [col for col, is_numeric in zip(acs_data.columns, numeric_mask) if is_numeric]\n", | |
| "non_numeric_cols = [col for col, is_numeric in zip(acs_data.columns, numeric_mask) if not is_numeric]\n", | |
| "\n", | |
| "# The NaN search below only makes sense on the numeric part of the table.\n", | |
| "acs_data_numeric = acs_data[numeric_cols]\n", | |
| "non_numeric_cols" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "001dcf89-c4a4-4587-bf2b-327c33f5788f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Column CITWP has 3027081 NaNs (93.5%).\n", | |
| "Column COW has 1322223 NaNs (40.9%).\n", | |
| "Column DDRS has 162925 NaNs (5.0%).\n", | |
| "Column DOUT has 535206 NaNs (16.5%).\n", | |
| "Column DPHY has 162925 NaNs (5.0%).\n", | |
| "Column DRAT has 3185384 NaNs (98.4%).\n", | |
| "Column DRATX has 2986009 NaNs (92.3%).\n", | |
| "Column DREM has 162925 NaNs (5.0%).\n", | |
| "Column ENG has 2649542 NaNs (81.9%).\n", | |
| "Column FER has 2536823 NaNs (78.4%).\n", | |
| "Column GCL has 1126766 NaNs (34.8%).\n", | |
| "Column GCM has 3211594 NaNs (99.2%).\n", | |
| "Column GCR has 3167599 NaNs (97.9%).\n", | |
| "Column INTP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 2330183 zeros (72.0%).\n", | |
| "Column JWMNP has 1836484 NaNs (56.7%).\n", | |
| "Column JWRIP has 1967271 NaNs (60.8%).\n", | |
| "Column JWTR has 1752768 NaNs (54.2%).\n", | |
| "Column LANX has 162925 NaNs (5.0%).\n", | |
| "Column MARHD has 1341266 NaNs (41.4%).\n", | |
| "Column MARHM has 1341266 NaNs (41.4%).\n", | |
| "Column MARHT has 1341266 NaNs (41.4%).\n", | |
| "Column MARHW has 1341266 NaNs (41.4%).\n", | |
| "Column MARHYP has 1341266 NaNs (41.4%).\n", | |
| "Column MIG has 30520 NaNs (0.9%).\n", | |
| "Column MIL has 614704 NaNs (19.0%).\n", | |
| "Column MLPA has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 169746 zeros (5.2%).\n", | |
| "Column MLPB has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 173760 zeros (5.4%).\n", | |
| "Column MLPCD has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 160608 zeros (5.0%).\n", | |
| "Column MLPE has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 133691 zeros (4.1%).\n", | |
| "Column MLPFG has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 186670 zeros (5.8%).\n", | |
| "Column MLPH has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 197956 zeros (6.1%).\n", | |
| "Column MLPI has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 212348 zeros (6.6%).\n", | |
| "Column MLPJ has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 208638 zeros (6.4%).\n", | |
| "Column MLPK has 3021594 NaNs (93.4%).\n", | |
| "\t\t\t 214356 zeros (6.6%).\n", | |
| "Column NWAB has 574865 NaNs (17.8%).\n", | |
| "Column NWAV has 574865 NaNs (17.8%).\n", | |
| "Column NWLA has 574865 NaNs (17.8%).\n", | |
| "Column NWLK has 574865 NaNs (17.8%).\n", | |
| "Column NWRE has 574865 NaNs (17.8%).\n", | |
| "Column OIP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 2548615 zeros (78.8%).\n", | |
| "Column PAP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 2666594 zeros (82.4%).\n", | |
| "Column RETP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 2369699 zeros (73.2%).\n", | |
| "Column SCH has 95465 NaNs (2.9%).\n", | |
| "Column SCHG has 2484712 NaNs (76.8%).\n", | |
| "Column SCHL has 95465 NaNs (2.9%).\n", | |
| "Column SEMP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 2536197 zeros (78.4%).\n", | |
| "Column SSIP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 2616173 zeros (80.8%).\n", | |
| "Column SSP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 2042064 zeros (63.1%).\n", | |
| "Column WAGP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 1122407 zeros (34.7%).\n", | |
| "Column WKHP has 1557981 NaNs (48.1%).\n", | |
| "Column WKL has 574865 NaNs (17.8%).\n", | |
| "Column WKW has 1557981 NaNs (48.1%).\n", | |
| "Column WRK has 859027 NaNs (26.5%).\n", | |
| "Column YOEP has 2809462 NaNs (86.8%).\n", | |
| "Column DECADE has 2809462 NaNs (86.8%).\n", | |
| "Column DRIVESP has 1967271 NaNs (60.8%).\n", | |
| "Column ESP has 2619012 NaNs (80.9%).\n", | |
| "Column ESR has 574865 NaNs (17.8%).\n", | |
| "Column FOD1P has 2439281 NaNs (75.4%).\n", | |
| "Column FOD2P has 3148321 NaNs (97.3%).\n", | |
| "Column INDP has 1322223 NaNs (40.9%).\n", | |
| "Column JWAP has 1836484 NaNs (56.7%).\n", | |
| "Column JWDP has 1836484 NaNs (56.7%).\n", | |
| "Column LANP has 2649542 NaNs (81.9%).\n", | |
| "Column MIGPUMA has 2809427 NaNs (86.8%).\n", | |
| "Column MIGSP has 2809427 NaNs (86.8%).\n", | |
| "Column MSP has 535206 NaNs (16.5%).\n", | |
| "Column NOP has 2619012 NaNs (80.9%).\n", | |
| "Column OC has 153610 NaNs (4.7%).\n", | |
| "\t\t\t 2513301 zeros (77.7%).\n", | |
| "Column OCCP has 1322223 NaNs (40.9%).\n", | |
| "Column PAOC has 1927168 NaNs (59.6%).\n", | |
| "Column PERNP has 574865 NaNs (17.8%).\n", | |
| "\t\t\t 983842 zeros (30.4%).\n", | |
| "Column PINCP has 535206 NaNs (16.5%).\n", | |
| "\t\t\t 336113 zeros (10.4%).\n", | |
| "Column POVPIP has 135180 NaNs (4.2%).\n", | |
| "\t\t\t 57749 zeros (1.8%).\n", | |
| "Column POWPUMA has 1752768 NaNs (54.2%).\n", | |
| "Column POWSP has 1752768 NaNs (54.2%).\n", | |
| "Column RC has 153610 NaNs (4.7%).\n", | |
| "\t\t\t 2444568 zeros (75.5%).\n", | |
| "Column SCIENGP has 2439281 NaNs (75.4%).\n", | |
| "Column SCIENGRLP has 2439281 NaNs (75.4%).\n", | |
| "Column SFN has 3136282 NaNs (96.9%).\n", | |
| "Column SFR has 3136282 NaNs (96.9%).\n", | |
| "Column VPS has 3021594 NaNs (93.4%).\n", | |
| "Column FHINS3C has 2530985 NaNs (78.2%).\n", | |
| "\t\t\t 657514 zeros (20.3%).\n", | |
| "Column FHINS4C has 2608417 NaNs (80.6%).\n", | |
| "\t\t\t 576508 zeros (17.8%).\n", | |
| "Column FHINS5C has 3129231 NaNs (96.7%).\n", | |
| "\t\t\t 106093 zeros (3.3%).\n", | |
| "cols_with_nans_and_zeros=['INTP', 'MLPA', 'MLPB', 'MLPCD', 'MLPE', 'MLPFG', 'MLPH', 'MLPI', 'MLPJ', 'MLPK', 'OIP', 'PAP', 'RETP', 'SEMP', 'SSIP', 'SSP', 'WAGP', 'OC', 'PERNP', 'PINCP', 'POVPIP', 'RC', 'FHINS3C', 'FHINS4C', 'FHINS5C']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Report, for each numeric column that contains NaNs, how many NaNs it has and\n", | |
| "# whether it also contains zeros. Columns without NaNs are skipped entirely.\n", | |
| "cols_with_nans = []\n", | |
| "cols_with_nans_and_zeros = []\n", | |
| "\n", | |
| "for col, col_data in acs_data_numeric.items():\n", | |
| "    n_nans = np.sum(np.isnan(col_data))\n", | |
| "    if n_nans == 0:\n", | |
| "        continue\n", | |
| "\n", | |
| "    cols_with_nans.append(col)\n", | |
| "    print(f\"Column {col:10} has {n_nans:7} NaNs ({n_nans / len(col_data):.1%}).\")\n", | |
| "\n", | |
| "    # Count zeros as well: a column holding both NaNs and zeros becomes ambiguous\n", | |
| "    # if its NaNs are later represented as 0.\n", | |
| "    n_zeros = np.sum(np.isclose(col_data, 0))\n", | |
| "    if n_zeros > 0:\n", | |
| "        print(f\"\\t\\t\\t {n_zeros} zeros ({n_zeros / len(col_data):.1%}).\")\n", | |
| "        cols_with_nans_and_zeros.append(col)\n", | |
| "\n", | |
| "print(f\"{cols_with_nans_and_zeros=}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "97e365f9-2878-42b0-9b5b-3a1a74b4c874", | |
| "metadata": {}, | |
| "source": [ | |
| "## Check which tasks are compromised\n", | |
| "> NaNs only matter for a task if they occur in its feature or target columns, so each task is checked separately below." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "362dd290-c605-4b7e-a41f-641fc2b88837", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'ACSIncome': <folktables.folktables.BasicProblem at 0x11cd8b6d0>,\n", | |
| " 'ACSEmployment': <folktables.folktables.BasicProblem at 0x10ac7b510>,\n", | |
| " 'ACSHealthInsurance': <folktables.folktables.BasicProblem at 0x11c6d2990>,\n", | |
| " 'ACSPublicCoverage': <folktables.folktables.BasicProblem at 0x11cd8ba50>,\n", | |
| " 'ACSTravelTime': <folktables.folktables.BasicProblem at 0x11cd8bb50>,\n", | |
| " 'ACSMobility': <folktables.folktables.BasicProblem at 0x11cd8bb10>,\n", | |
| " 'ACSEmploymentFiltered': <folktables.folktables.BasicProblem at 0x11cd8bd50>,\n", | |
| " 'ACSIncomePovertyRatio': <folktables.folktables.BasicProblem at 0x10bd244d0>}" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import folktables\n", | |
| "from folktables import BasicProblem\n", | |
| "\n", | |
| "# Gather every BasicProblem instance found among the folktables module attributes, keyed by attribute name.\n", | |
| "acs_tasks = {\n", | |
| "    attr_name: attr_value\n", | |
| "    for attr_name, attr_value in vars(folktables).items()\n", | |
| "    if isinstance(attr_value, BasicProblem)\n", | |
| "}\n", | |
| "acs_tasks" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "48c030a3-3a15-42d3-9f38-e4f2bd412d2c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "cb51e9f390154f2982f03db3b29010f8", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| " 0%| | 0/8 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "ACSEmployment .MIG :: NaNs==0.9%\n", | |
| "ACSEmployment .SCHL :: NaNs==2.9%\n", | |
| "ACSEmployment .MIL :: NaNs==19.0%\n", | |
| "ACSEmployment .DREM :: NaNs==5.0%\n", | |
| "ACSEmployment .ESR :: NaNs==17.8%\n", | |
| "ACSEmployment .ESP :: NaNs==80.9%\n", | |
| "ACSHealthInsurance .MIG :: NaNs==0.9%\n", | |
| "ACSHealthInsurance .MIL :: NaNs==19.0%\n", | |
| "ACSHealthInsurance .PINCP :: NaNs==16.5% :: 0s==10.4% -- compromised\n", | |
| "ACSHealthInsurance .SCHL :: NaNs==2.9%\n", | |
| "ACSHealthInsurance .DREM :: NaNs==5.0%\n", | |
| "ACSHealthInsurance .ESR :: NaNs==17.8%\n", | |
| "ACSHealthInsurance .FER :: NaNs==78.4%\n", | |
| "ACSHealthInsurance .ESP :: NaNs==80.9%\n", | |
| "ACSPublicCoverage .MIL :: NaNs==7.0%\n", | |
| "ACSPublicCoverage .ESR :: NaNs==3.5%\n", | |
| "ACSPublicCoverage .FER :: NaNs==59.7%\n", | |
| "ACSPublicCoverage .ESP :: NaNs==90.5%\n", | |
| "ACSTravelTime .JWMNP :: NaNs==5.7%\n", | |
| "ACSTravelTime .POVPIP :: NaNs==1.0% :: 0s==0.0% -- compromised\n", | |
| "ACSTravelTime .ESP :: NaNs==99.4%\n", | |
| "ACSMobility .WKHP :: NaNs==17.9%\n", | |
| "ACSMobility .COW :: NaNs==10.9%\n", | |
| "ACSMobility .JWMNP :: NaNs==29.8%\n", | |
| "ACSMobility .GCL :: NaNs==67.2%\n", | |
| "ACSMobility .ESP :: NaNs==100.0%\n", | |
| "ACSEmploymentFiltered.GCL :: NaNs==19.8%\n", | |
| "ACSEmploymentFiltered.ESP :: NaNs==98.6%\n", | |
| "ACSIncomePovertyRatio.WKHP :: NaNs==48.1%\n", | |
| "ACSIncomePovertyRatio.MIG :: NaNs==0.9%\n", | |
| "ACSIncomePovertyRatio.MIL :: NaNs==19.0%\n", | |
| "ACSIncomePovertyRatio.SCHL :: NaNs==2.9%\n", | |
| "ACSIncomePovertyRatio.DREM :: NaNs==5.0%\n", | |
| "ACSIncomePovertyRatio.GCL :: NaNs==34.8%\n", | |
| "ACSIncomePovertyRatio.ESR :: NaNs==17.8%\n", | |
| "ACSIncomePovertyRatio.POVPIP :: NaNs==4.2% :: 0s==1.8% -- compromised\n", | |
| "ACSIncomePovertyRatio.OCCP :: NaNs==40.9%\n", | |
| "ACSIncomePovertyRatio.ESP :: NaNs==80.9%\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from tqdm.auto import tqdm\n", | |
| "\n", | |
| "# Maps each task name to the columns it uses that contain both NaNs and zeros.\n", | |
| "compromised_cols_per_task = {}\n", | |
| "numeric_col_names = set(numeric_cols)  # set membership instead of scanning the list per column\n", | |
| "\n", | |
| "for task_name, task in tqdm(acs_tasks.items()):\n", | |
| "    # Filter out data not used for this task\n", | |
| "    task_data = task._preprocess(acs_data)\n", | |
| "\n", | |
| "    # Visit the task's features, then its target, in declared order (duplicates dropped).\n", | |
| "    # Iterating a plain set here would reorder the report from one kernel run to the next.\n", | |
| "    task_cols = [\n", | |
| "        col for col in dict.fromkeys([*task.features, task.target])\n", | |
| "        if col in task_data.columns and col in numeric_col_names\n", | |
| "    ]\n", | |
| "\n", | |
| "    # A column is flagged as compromised when it contains both NaNs and zeros\n", | |
| "    compromised_cols = []\n", | |
| "    for col in task_cols:\n", | |
| "        col_data = task_data[col]\n", | |
| "        n_nans = np.sum(np.isnan(col_data))\n", | |
| "        if n_nans == 0:\n", | |
| "            continue\n", | |
| "\n", | |
| "        print(f\"{task_name:20}.{col:5} :: NaNs=={n_nans / len(col_data):.1%}\", end=\"\")\n", | |
| "\n", | |
| "        # Zeros are only relevant (and only counted) once the column is known to hold NaNs\n", | |
| "        n_zeros = np.sum(np.isclose(col_data, 0))\n", | |
| "        if n_zeros > 0:\n", | |
| "            print(f\" :: 0s=={n_zeros / len(col_data):.1%} -- compromised\", end=\"\")\n", | |
| "            compromised_cols.append(col)\n", | |
| "\n", | |
| "        print()  # terminate the report line started above\n", | |
| "\n", | |
| "    compromised_cols_per_task[task_name] = compromised_cols\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "5cfca50e-22d3-446e-aa16-f9be60fca0ea", | |
| "metadata": {}, | |
| "source": [ | |
| "---" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment