mrocklin · July 18, 2021 16:59
diff --git a/pandas-arrow-string.ipynb b/pandas-arrow-string.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PyArrow String Columns in Pandas\n",
    "\n",
    "Pandas recently introduced a `string[pyarrow]` column dtype, which stores data in Arrow memory rather than a list of Python strings.  This results in more memory efficiency, faster speeds, and increased parallelism.  \n",
    "\n",
    "This notebook establishes these points first with Pandas, and then in parallel with Dask."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grab some text data\n",
    "\n",
    "Available here: https://gist.github.com/dannguyen/69c08015ae4443f1e651a3d3b1c5a036"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"/home/mrocklin/data/arcos_all_washpost.tsv\", sep=\"\\t\", nrows=1000000)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lower Size in Memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.utils import format_bytes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "format_bytes(df.REPORTER_NAME.memory_usage(deep=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "format_bytes(df.REPORTER_NAME.astype(\"string\").memory_usage(deep=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "format_bytes(df.REPORTER_NAME.astype(\"string[pyarrow]\").memory_usage(deep=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Faster to compute"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "python = df.REPORTER_NAME.astype(\"string\")\n",
    "arrow = df.REPORTER_NAME.astype(\"string[pyarrow]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit python.str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit arrow.str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit python.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit arrow.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Releases the GIL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext ptime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%ptime -n 4 python.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%ptime -n 4 arrow.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Now with Dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "client = Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask\n",
    "\n",
    "df = dask.datasets.timeseries(\n",
    "    start=\"2021-01-01\",\n",
    "    end=\"2022-01-01\",\n",
    "    freq=\"500ms\",\n",
    ").persist()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.utils import format_bytes\n",
    "\n",
    "format_bytes(df.memory_usage(deep=True).sum().compute())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time df.groupby(\"name\").x.mean().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"name\"] = df[\"name\"].astype(\"string[pyarrow]\")\n",
    "df = df.persist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "format_bytes(df.memory_usage(deep=True).sum().compute())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time df.groupby(\"name\").x.mean().compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Trim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ctypes\n",
    "def trim_memory() -> int:\n",
    "     libc = ctypes.CDLL(\"libc.so.6\")\n",
    "     return libc.malloc_trim(0)\n",
    "client.run(trim_memory)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# PyArrow String Columns in Pandas\n",
	"\n",
	"Pandas recently introduced a `string[pyarrow]` column dtype, which stores data in Arrow memory rather than a list of Python strings. This results in more memory efficiency, faster speeds, and increased parallelism. \n",
	"\n",
	"This notebook establishes these points first with Pandas, and then in parallel with Dask."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Grab some text data\n",
	"\n",
	"Available here: https://gist.github.com/dannguyen/69c08015ae4443f1e651a3d3b1c5a036"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"\n",
	"df = pd.read_csv(\"/home/mrocklin/data/arcos_all_washpost.tsv\", sep=\"\\t\", nrows=1000000)\n",
	"df.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Lower Size in Memory"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"from dask.utils import format_bytes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"format_bytes(df.REPORTER_NAME.memory_usage(deep=True))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"format_bytes(df.REPORTER_NAME.astype(\"string\").memory_usage(deep=True))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"format_bytes(df.REPORTER_NAME.astype(\"string[pyarrow]\").memory_usage(deep=True))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Faster to compute"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"python = df.REPORTER_NAME.astype(\"string\")\n",
	"arrow = df.REPORTER_NAME.astype(\"string[pyarrow]\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%timeit python.str.lower()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%timeit arrow.str.lower()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%timeit python.value_counts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%timeit arrow.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Releases the GIL"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%load_ext ptime"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%ptime -n 4 python.value_counts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%ptime -n 4 arrow.value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Now with Dask"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"from dask.distributed import Client\n",
	"client = Client()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import dask\n",
	"\n",
	"df = dask.datasets.timeseries(\n",
	" start=\"2021-01-01\",\n",
	" end=\"2022-01-01\",\n",
	" freq=\"500ms\",\n",
	").persist()\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"len(df)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"from dask.utils import format_bytes\n",
	"\n",
	"format_bytes(df.memory_usage(deep=True).sum().compute())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%time df.groupby(\"name\").x.mean().compute()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"df[\"name\"] = df[\"name\"].astype(\"string[pyarrow]\")\n",
	"df = df.persist()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"format_bytes(df.memory_usage(deep=True).sum().compute())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%time df.groupby(\"name\").x.mean().compute()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Trim"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import ctypes\n",
	"def trim_memory() -> int:\n",
	" libc = ctypes.CDLL(\"libc.so.6\")\n",
	" return libc.malloc_trim(0)\n",
	"client.run(trim_memory)\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}
No results found