Created
March 30, 2021 04:17
-
-
Save gerrymanoim/da7f85e069adc3cd075dc32eb5a27984 to your computer and use it in GitHub Desktop.
Python implementation of IForestIndex
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# IForestIndex in Python\n", | |
| "\n", | |
| "via https://thume.ca/2021/03/14/iforests/. Not that one would do this, but for learning purposes. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Notes\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "via https://thume.ca/2021/03/14/iforests/\n", | |
| "\n", | |
| "> The general idea behind data structures to accelerate range queries is that pre-aggregating elements into chunks of varying sizes can save work at query-time. When we get the range we want to query, we pick the set of chunks that together make up the range and aggregate them together, as opposed to aggregating all the individual elements in the range.\n", | |
| "\n", | |
| "> The in-order layout is a way to store an aggregation tree in an array where every even indexed element is a leaf and every odd indexed element aggregates to its left and right using some associative operation (the diagram uses sum). The aggregating nodes form a binary tree structure such that the first level aggregates two leaf nodes, the second level aggregates two level one aggregation nodes, etc…\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Implementation" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from typing import *\n", | |
| "import operator" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def trailing_ones(v: int):\n", | |
| " s = bin(v)\n", | |
| " return len(s)-len(s.rstrip(\"1\"))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "class IForestIndex:\n", | |
| " def __init__(self, func: Callable, identity: Callable):\n", | |
| " self._storage = list()\n", | |
| " self._func = func\n", | |
| " self._identity = identity\n", | |
| " \n", | |
| " def __getitem__(self, q: Union[int, slice]):\n", | |
| " \"\"\"getitem implements direct access and range query\"\"\"\n", | |
| " if isinstance(q, int):\n", | |
| " return self._storage[q]\n", | |
| " elif isinstance(q , slice):\n", | |
| " idx_start, idx_stop = q.start*2, q.stop*2\n", | |
| " n_vals = len(self)\n", | |
| " assert (idx_start <= n_vals) and (idx_stop <= n_vals)\n", | |
| " return self.range_query(idx_start, idx_stop)\n", | |
| " else:\n", | |
| " raise ValueError(f\"Don't know how to index using q\")\n", | |
| " \n", | |
| " def __len__(self):\n", | |
| " return len(self._storage)\n", | |
| " \n", | |
| " def values(self):\n", | |
| " return [v for i, v in enumerate(self._storage) if i%2==0]\n", | |
| " \n", | |
| " def append(self, item):\n", | |
| " self._storage.append(item)\n", | |
| " n_vals = len(self)\n", | |
| " levels_to_index = trailing_ones(n_vals) - 1\n", | |
| " current_level = n_vals - 1\n", | |
| " for level in range(levels_to_index):\n", | |
| " prev_higher_level = current_level-(2**level)\n", | |
| " combined = self._func(self[prev_higher_level], self[current_level])\n", | |
| " self._storage[prev_higher_level] = combined\n", | |
| " current_level = prev_higher_level\n", | |
| " \n", | |
| " self._storage.append(self._storage[n_vals-(2**levels_to_index)])\n", | |
| " \n", | |
| " def range_query(self, start: int, stop: int):\n", | |
| " combined = self._identity()\n", | |
| " \n", | |
| " # not standard, but easy to read\n", | |
| " left_child_at = lambda node, level: (node >> level)&1 == 0\n", | |
| " skip = lambda level: 2**(level+1)\n", | |
| " agg_node = lambda node, level: node + (2**level) - 1\n", | |
| " \n", | |
| " while start < stop:\n", | |
| " up_level = 1\n", | |
| " while left_child_at(start, up_level) & (start + skip(up_level) <= stop):\n", | |
| " up_level += 1\n", | |
| " \n", | |
| " level = up_level - 1\n", | |
| " \n", | |
| " combined = self._func(combined, self._storage[agg_node(start, level)])\n", | |
| " start += skip(level)\n", | |
| " \n", | |
| " return combined\n", | |
| " " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "summer = IForestIndex(func=operator.add, identity=lambda: 0)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "raw_vals = [0] + list(range(2, 12))\n", | |
| "for val in raw_vals:\n", | |
| " summer.append(val)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[0, 2, 2, 9, 3, 7, 4, 35, 5, 11, 6, 26, 7, 15, 8, 35, 9, 19, 10, 19, 11, 11]" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "summer._storage" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "summer.values()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "44" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "summer[1:9]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment