Last active
February 10, 2026 08:35
-
-
Save trueroad/c0fd706b5a68bb5d36698bbce72f1bef to your computer and use it in GitHub Desktop.
Perform hierarchical/agglomerative clustering.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Perform hierarchical/agglomerative clustering. | |
| https://gist.github.com/trueroad/c0fd706b5a68bb5d36698bbce72f1bef | |
| Copyright (C) 2026 Masamichi Hosoda. | |
| All rights reserved. | |
| Redistribution and use in source and binary forms, with or without | |
| modification, are permitted provided that the following conditions | |
| are met: | |
| * Redistributions of source code must retain the above copyright notice, | |
| this list of conditions and the following disclaimer. | |
| * Redistributions in binary form must reproduce the above copyright notice, | |
| this list of conditions and the following disclaimer in the documentation | |
| and/or other materials provided with the distribution. | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| ARE DISCLAIMED. | |
| IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| SUCH DAMAGE. | |
| """ | |
| import os | |
| import sys | |
| from typing import Any, Literal, Optional, Union | |
| import numpy as np | |
| import numpy.typing as npt | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
| from scipy.cluster.hierarchy import cophenet, dendrogram, linkage | |
| from scipy.spatial.distance import squareform | |
| class Clustering: | |
| """Clustering Class.""" | |
| def __init__(self) -> None: | |
| """__init__.""" | |
| # 階層型クラスタリングのメソッド | |
| self.method: Literal['single', 'complete', | |
| 'average', 'weighted'] = 'average' | |
| # 距離行列のデータフレーム | |
| self.df: pd.DataFrame | |
| # 距離行列 | |
| self.matrix: npt.NDArray[np.float64] | |
| # 距離行列のラベル | |
| self.labels: Optional[list[str]] = None | |
| # 凝縮距離行列 | |
| self.cmatrix: npt.NDArray[np.float64] | |
| # リンケージ行列(階層型クラスタリング結果) | |
| self.linkage: npt.NDArray[np.float64] | |
| def load_distance_matrix(self, | |
| filename: Union[str, os.PathLike[str]] | |
| ) -> None: | |
| """ | |
| Load distance matrix .csv file. | |
| 距離行列を.csvファイルから読み込む | |
| Args: | |
| filename (Union[str, os.PathLike[str]]): .csvファイルのパス | |
| """ | |
| print('\nLoading distance matrix .csv file.') | |
| # データフレームとして読み込む | |
| self.df = pd.read_csv(filename, | |
| header=0, | |
| index_col=0, | |
| float_precision='round_trip') | |
| # データフレームから距離行列を読み込む | |
| self.matrix = self.df.to_numpy() | |
| # データフレームのカラムをラベルに設定する | |
| self.labels = list(self.df.columns) | |
| def finitize_distance_matrix(self) -> None: | |
| """ | |
| Finitize distance matrix. | |
| 距離行列を有限化する | |
| """ | |
| print('\nFinitizing the distance matrix.') | |
| # infやnanを除いた有限の最大値を求める | |
| max_finite: np.float64 = self.matrix[np.isfinite(self.matrix)].max() | |
| print(f'max_finite: {max_finite}') | |
| # 対角成分がinfやnanだったら0に置き換える | |
| for i in range(min(self.matrix.shape[0], self.matrix.shape[1])): | |
| if not np.isfinite(self.matrix[i][i]): | |
| print(f'Warning: The distance matrix[{i}][{i}] is ' | |
| f'{self.matrix[i][i]}. Overwrites it with zero ' | |
| 'since it is a diagonal component.') | |
| # 書き込むのはデータフレーム | |
| self.df.iat[i, i] = 0 | |
| # infやnanを有限最大の2倍に置き換える(データフレーム) | |
| self.df = self.df.replace(np.inf, max_finite * 2) | |
| self.df = self.df.replace(np.nan, max_finite * 2) | |
| # データフレームから距離行列を再度読み込む | |
| self.matrix = self.df.to_numpy() | |
| def fix_distance_matrix(self) -> None: | |
| """ | |
| Fix distance matrix. | |
| 距離行列をチェックして修正する | |
| """ | |
| print('\nFixing the distance matrix.') | |
| # 正方行列か否か確認する | |
| if not (self.matrix.ndim == 2 | |
| and (self.matrix.shape[0] == self.matrix.shape[1])): | |
| # 与えられた距離行列は正方行列ではない | |
| raise ValueError('The distance matrix is not a square matrix.') | |
| # 対角成分がゼロか確認する | |
| if not (np.allclose(np.diagonal(self.matrix), 0)): | |
| # 対角成分がゼロではない | |
| raise ValueError('Diagonal elements of the distance matrix are ' | |
| 'non zero.') | |
| # 対称行列か確認する | |
| if not (np.allclose(self.matrix, self.matrix.T)): | |
| # 対称行列ではないので対称化する | |
| print('Warning: The distance matrix is not symmetric. ' | |
| 'Makes it symmetric.') | |
| self.matrix = (self.matrix + self.matrix.T) / 2 | |
| def load_linkage_matrix(self, | |
| filename: Union[str, os.PathLike[str]] | |
| ) -> None: | |
| """ | |
| Load linkage matrix .csv file. | |
| リンケージ行列(クラスタリング結果)を.csvファイルから読み込む | |
| (クラスタリングせずに結果を流用する) | |
| Args: | |
| filename (Union[str, os.PathLike[str]]): .csvファイルのパス | |
| """ | |
| print('\nLoading linkage matrix .csv file.') | |
| # データフレームとして読み込む | |
| df = pd.read_csv(filename, | |
| header=0, | |
| index_col=0, | |
| float_precision='round_trip') | |
| # データフレームからリンケージ行列を読み込む | |
| self.linkage = df.to_numpy() | |
| def save_linkage_matrix(self, | |
| filename: Union[str, os.PathLike[str]] | |
| ) -> None: | |
| """ | |
| Save linkage matrix .csv file. | |
| リンケージ行列(クラスタリング結果)を.csvファイルに保存する | |
| Args: | |
| filename (Union[str, os.PathLike[str]]): .csvファイルのパス | |
| """ | |
| print('\nSaving linkage matrix .csv file') | |
| # リンケージ行列をデータフレーム化する | |
| df = pd.DataFrame(self.linkage) | |
| # CSVをExcelで読み込む際の文字化けを防ぐため | |
| # encodingをUTF-8 BOM付きとして出力する。 | |
| # pandasのread_csvはBOMがついていたら無視するため問題ない。 | |
| # 浮動小数点数をround tripする | |
| # (CSVを読み込んだ際に正確に再現する)ため、 | |
| # float64の精度を記録できる17桁で出力する。 | |
| df.to_csv(filename, | |
| encoding='utf-8-sig', | |
| float_format='%.17g') | |
| def calc(self) -> None: | |
| """ | |
| Calc. | |
| クラスタリングを実行する | |
| """ | |
| print('\nCalc clustering.') | |
| # 凝縮距離行列にする | |
| self.cmatrix = squareform(self.matrix) | |
| # クラスタリング実行 | |
| self.linkage = linkage(self.cmatrix, | |
| self.method, | |
| optimal_ordering=True) | |
| # CPCC(コーフェン相関係数)を計算して表示する | |
| # 1に近いほど(大きいほど)良いクラスタリングができている | |
| cpcc, _ = cophenet(self.linkage, self.cmatrix) | |
| print(f'CPCC: method {self.method}: {cpcc}') | |
| def show_dendrogram(self, | |
| xlim: Optional[float] = None, | |
| color_threshold: Optional[float] = None) -> None: | |
| """ | |
| Show dendrogram. | |
| デンドログラムを表示する | |
| Args: | |
| xlim (Optional[float]): | |
| 表示する横軸(距離)の最大、Noneはデフォルト | |
| color_threshold (Optional[float]): | |
| 指定値以下のクラスタを色で塗り分ける、Noneはデフォルト | |
| """ | |
| plt.figure(layout='tight') | |
| dendrogram(self.linkage, | |
| orientation='right', | |
| labels=self.labels, | |
| color_threshold=color_threshold) | |
| plt.xlim(0, xlim) | |
| plt.show() | |
| def main() -> None: | |
| """Do main.""" | |
| import argparse | |
| parser: argparse.ArgumentParser = argparse.ArgumentParser() | |
| parser.add_argument('--input-distance-matrix', | |
| help='(input) distance matrix .csv file.', | |
| type=str, required=False) | |
| parser.add_argument('--output-linkage-matrix', | |
| help='(output) linkage matrix .csv file.', | |
| type=str, required=False) | |
| parser.add_argument('--input-linkage-matrix', | |
| help='(input) linkage matrix .csv file.', | |
| type=str, required=False) | |
| parser.add_argument('--linkage-method', | |
| help='Linkage method. ' | |
| 'single (nearest), complete (farthest), ' | |
| 'average (UPGMA), or weighted (WPGMA).', | |
| type=str, default='average', required=False) | |
| parser.add_argument('--dendrogram-xlim', | |
| help='Dendrogarm X limit.', | |
| type=float, required=False) | |
| parser.add_argument('--dendrogram-color-threshold', | |
| help='Dendrogarm color threshold.', | |
| type=float, required=False) | |
| parser.add_argument('--show-dendrogram', | |
| help='Show dendrogram.', | |
| action='store_true') | |
| args: argparse.Namespace = parser.parse_args() | |
| vargs: dict[str, Any] = vars(args) | |
| in_distance_matrix: Optional[str] = vargs['input_distance_matrix'] | |
| out_linkage_matrix: Optional[str] = vargs['output_linkage_matrix'] | |
| in_linkage_matrix: Optional[str] = vargs['input_linkage_matrix'] | |
| linkage_method: str = vargs['linkage_method'] | |
| xlim: Optional[float] = vargs['dendrogram_xlim'] | |
| color_threshold: Optional[float] = vargs['dendrogram_color_threshold'] | |
| b_show_dendrogram: bool = vargs['show_dendrogram'] | |
| print('Files\n' | |
| f' (in) distance matrix: {in_distance_matrix}\n' | |
| f' (out) linkage matrix : {out_linkage_matrix}\n' | |
| f' (in) linkage matrix : {in_linkage_matrix}\n' | |
| 'Linkage parameters\n' | |
| f' method : {linkage_method}\n' | |
| 'Visualization parameters\n' | |
| f' Dendrogram X limit : {xlim}\n' | |
| f' Dendrogram color th : {color_threshold}\n' | |
| 'Other\n' | |
| f' Show dendrogram : {b_show_dendrogram}') | |
| if in_distance_matrix is None and in_linkage_matrix is None: | |
| # 距離行列とリンケージ行列の両方とも入力されていない | |
| print('Error: neither distance matrix nor linkage matrix ' | |
| 'is specified as input.') | |
| sys.exit(1) | |
| if out_linkage_matrix is not None and in_linkage_matrix is not None: | |
| # リンケージ行列の入力と出力を両方指定してはならない | |
| print('Error: both the input and output of the linkage matrix ' | |
| 'are specified.') | |
| sys.exit(1) | |
| if in_linkage_matrix is not None and in_distance_matrix is None: | |
| # リンケージ行列の入力がある(クラスタリングせずに結果を流用する)が | |
| # 距離行列.csvが指定されていないのでラベルが表示できない | |
| # (ラベルは距離行列から得る) | |
| print('Warning: The distance matrix .csv file is not specified, ' | |
| 'so labels cannot be displayed. ' | |
| 'Labels are obtained from the distance matrix .csv file.') | |
| cl: Clustering = Clustering() | |
| if in_distance_matrix is not None: | |
| cl.load_distance_matrix(in_distance_matrix) | |
| if in_linkage_matrix is not None: | |
| cl.load_linkage_matrix(in_linkage_matrix) | |
| else: | |
| cl.finitize_distance_matrix() | |
| cl.fix_distance_matrix() | |
| cl.calc() | |
| if out_linkage_matrix is not None: | |
| cl.save_linkage_matrix(out_linkage_matrix) | |
| if b_show_dendrogram: | |
| cl.show_dendrogram(xlim=xlim, color_threshold=color_threshold) | |
| if __name__ == '__main__': | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment