Browse Source

cache .xlsx as .feather

Fabian Peter Hammerle 4 years ago
parent
commit
2de337428a
3 changed files with 51 additions and 44 deletions
  1. 1 0
      .gitignore
  2. 1 1
      container
  3. 49 43
      pollution.ipynb

+ 1 - 0
.gitignore

@@ -1,2 +1,3 @@
+*.feather
 *.xlsx
 *.zip

+ 1 - 1
container

@@ -1 +1 @@
-Subproject commit 567a1d7113fbb7e2b6dbf16e58b28613059910ea
+Subproject commit f045b61fda4d3dcdd22809745288cf1ea1c4afc4

+ 49 - 43
pollution.ipynb

@@ -12,61 +12,48 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Archive:  E-PRTR_database_v17_xls.zip\n",
+      "  inflating: Pollutant releases.xlsx  \n"
+     ]
+    }
+   ],
    "source": [
     "!unzip E-PRTR_database_v17_xls.zip 'Pollutant releases.xlsx'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas\n",
+    "%matplotlib inline\n",
+    "%config InlineBackend.figure_format = 'retina'\n",
     "\n",
-    "pollutant_releases = pandas.read_excel('Pollutant releases.xlsx')\n",
-    "pollutant_releases.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "import pandas\n",
     "import geopandas\n",
-    "\n",
-    "# http://geopandas.org/gallery/create_geopandas_from_pandas.html\n",
-    "pollutant_releases_geo = geopandas.GeoDataFrame(\n",
-    "    pollutant_releases[[\n",
-    "        'FacilityID', 'Lat', 'Long', 'ReportingYear', 'PollutantName', \n",
-    "        'PollutantGroupName', 'TotalQuantity', 'TotalQuantity',\n",
-    "    ]],\n",
-    "    geometry=geopandas.points_from_xy(\n",
-    "        pollutant_releases['Long'],\n",
-    "        pollutant_releases['Lat'],\n",
-    "    ),\n",
-    ")\n",
-    "for column_name in pollutant_releases_geo.select_dtypes('object'):\n",
-    "    pollutant_releases_geo[column_name] = pollutant_releases_geo[column_name].astype('category')\n",
-    "pollutant_releases_geo.head()"
+    "import geoplot"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "<class 'geopandas.geodataframe.GeoDataFrame'>\n",
+      "<class 'pandas.core.frame.DataFrame'>\n",
       "RangeIndex: 652351 entries, 0 to 652350\n",
-      "Data columns (total 9 columns):\n",
+      "Data columns (total 7 columns):\n",
       "FacilityID            652351 non-null int64\n",
       "Lat                   652351 non-null float64\n",
       "Long                  652351 non-null float64\n",
@@ -74,15 +61,26 @@
       "PollutantName         652351 non-null category\n",
       "PollutantGroupName    652351 non-null category\n",
       "TotalQuantity         652351 non-null float64\n",
-      "TotalQuantity         652351 non-null float64\n",
-      "geometry              652351 non-null geometry\n",
-      "dtypes: category(2), float64(4), geometry(1), int64(2)\n",
-      "memory usage: 36.1 MB\n"
+      "dtypes: category(2), float64(3), int64(2)\n",
+      "memory usage: 26.1 MB\n",
+      "None\n",
+      "CPU times: user 3min, sys: 900 ms, total: 3min 1s\n",
+      "Wall time: 3min 1s\n"
      ]
     }
    ],
    "source": [
-    "pollutant_releases_geo.info()"
+    "%%time\n",
+    "\n",
+    "pollutant_releases = pandas.read_excel('Pollutant releases.xlsx')[[\n",
+    "    'FacilityID', 'Lat', 'Long', 'ReportingYear',\n",
+    "    'PollutantName', 'PollutantGroupName', 'TotalQuantity',\n",
+    "]]\n",
+    "for column_name in pollutant_releases.select_dtypes('object'):\n",
+    "    pollutant_releases[column_name] = pollutant_releases[column_name].astype('category')\n",
+    "pollutant_releases.to_feather('pollutant_releases.feather')\n",
+    "print(pollutant_releases.info())\n",
+    "del pollutant_releases;"
    ]
   },
   {
@@ -91,21 +89,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))"
+    "# http://geopandas.org/gallery/create_geopandas_from_pandas.html\n",
+    "pollutant_releases = pandas.read_feather('pollutant_releases.feather')\n",
+    "pollutant_releases_geo = geopandas.GeoDataFrame(\n",
+    "    pollutant_releases,\n",
+    "    geometry=geopandas.points_from_xy(\n",
+    "        pollutant_releases['Long'],\n",
+    "        pollutant_releases['Lat'],\n",
+    "    ),\n",
+    ")\n",
+    "del pollutant_releases\n",
+    "pollutant_releases_geo.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "%config InlineBackend.figure_format = 'retina'"
+    "world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -124,8 +132,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import geoplot\n",
-    "\n",
     "ax = geoplot.kdeplot(\n",
     "    european_facilities_2011,\n",
     "    clip=world.geometry,\n",