-
Notifications
You must be signed in to change notification settings - Fork 3
Clustering
s.ciccolella edited this page Jun 25, 2020
·
5 revisions
NOTE: To replicate this code, you need to clone the repository; the trees used here are located in mp3treesim/examples/trees.
A Jupyter Notebook of this file is available at mp3treesim/examples/clustering.ipynb.
import glob

import mp3treesim as mp3

# Load every example tree matching trees/tree*.gv.
# glob returns matches in arbitrary order, so the paths are sorted to keep
# the tree indices (and thus the matrix rows/columns built from them)
# reproducible across runs and platforms.
trees = [mp3.read_dotfile(path) for path in sorted(glob.glob('trees/tree*.gv'))]
tot_trees = len(trees)

import numpy as np
# Pairwise similarity matrix: entry (i, j) holds the value returned by
# mp3.similarity for trees[i] and trees[j].
sim_matrix = np.zeros((tot_trees, tot_trees))
for (i, j), _ in np.ndenumerate(sim_matrix):
    sim_matrix[i, j] = mp3.similarity(trees[i], trees[j])

import seaborn as sns
import matplotlib.pyplot as plt

# Draw the similarity matrix as a clustered heatmap. The colour bar is placed
# explicitly via cbar_pos so that it sits to the right of the plot.
colorbar_position = (1.1, .2, .03, .4)
sns.clustermap(sim_matrix, cbar_pos=colorbar_position)
plt.show()
from scipy.cluster.hierarchy import linkage, fcluster

# Build linkage clustering.
# NOTE(review): sim_matrix is a square 2-D array, which linkage interprets as
# one observation vector per row rather than as precomputed pairwise
# distances -- confirm this is the intended input.
Z = linkage(sim_matrix)
# Get labelling with a cut of 3 clusters
labels = fcluster(Z, 3, criterion="maxclust")

from sklearn.metrics import silhouette_score
# Transform the similarity matrix to a distance matrix
# as needed for a silhouette score computation
dist_matrix = 1 - sim_matrix

# Compute silhouette scores at different cuts: for each candidate number of
# clusters, cut the linkage Z into that many flat clusters and score the
# resulting labelling against the precomputed distances.
cuts = range(2, 10)
sil_scores = [
    silhouette_score(
        dist_matrix,
        fcluster(Z, n_clust, criterion="maxclust"),
        metric='precomputed',
    )
    for n_clust in cuts
]

# Plot the silhouette score against the number of clusters.
sns.lineplot(x=list(cuts), y=sil_scores)
plt.show()

MP3 tree similarity -- Version 1.0.6