Dendrogram of Hierarchical Clustering
This example draws a dendrogram from the result of a hierarchical clustering. It is based on the scikit-learn example at https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html.
import pandas as pd
import altair as alt
import numpy as np
# the variable `den` shown below is an example output of `scipy.cluster.hierarchy.dendrogram`
# (https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html#scipy.cluster.hierarchy.dendrogram)
# where the dendrogram itself is truncated such that no more than 3 levels of the dendrogram tree are shown.
# Hard-coded dendrogram description with three parallel entries:
#   "dcoord" - four y values per link (the corners of one upside-down "U"),
#   "icoord" - four x values per link, in the same row order as "dcoord",
#   "ivl"    - one text label per leaf.
den = {
    "dcoord": [
        [0.0, 0.8187388676087964, 0.8187388676087964, 0.0],
        [0.0, 1.105139508538779, 1.105139508538779, 0.0],
        [0.8187388676087964, 1.3712698320830048, 1.3712698320830048, 1.105139508538779],
        [0.0, 0.9099819926189507, 0.9099819926189507, 0.0],
        [0.0, 1.2539936203984452, 1.2539936203984452, 0.0],
        [0.9099819926189507, 1.9187528699821954, 1.9187528699821954, 1.2539936203984452],
        [1.3712698320830048, 3.828052620290243, 3.828052620290243, 1.9187528699821954],
        [0.0, 1.7604450194955439, 1.7604450194955439, 0.0],
        [0.0, 1.845844754344974, 1.845844754344974, 0.0],
        [1.7604450194955439, 4.847708507921838, 4.847708507921838, 1.845844754344974],
        [0.0, 2.8139388316471536, 2.8139388316471536, 0.0],
        [0.0, 2.8694176394568705, 2.8694176394568705, 0.0],
        [2.8139388316471536, 6.399406819518539, 6.399406819518539, 2.8694176394568705],
        [4.847708507921838, 12.300396052792589, 12.300396052792589, 6.399406819518539],
        [3.828052620290243, 32.44760699959244, 32.44760699959244, 12.300396052792589],
    ],
    "icoord": [
        [5.0, 5.0, 15.0, 15.0],
        [25.0, 25.0, 35.0, 35.0],
        [10.0, 10.0, 30.0, 30.0],
        [45.0, 45.0, 55.0, 55.0],
        [65.0, 65.0, 75.0, 75.0],
        [50.0, 50.0, 70.0, 70.0],
        [20.0, 20.0, 60.0, 60.0],
        [85.0, 85.0, 95.0, 95.0],
        [105.0, 105.0, 115.0, 115.0],
        [90.0, 90.0, 110.0, 110.0],
        [125.0, 125.0, 135.0, 135.0],
        [145.0, 145.0, 155.0, 155.0],
        [130.0, 130.0, 150.0, 150.0],
        [100.0, 100.0, 140.0, 140.0],
        [40.0, 40.0, 120.0, 120.0],
    ],
    "ivl": [
        "(7)", "(8)", "41", "(5)", "(10)", "(7)", "(4)", "(8)",
        "(9)", "(15)", "(5)", "(7)", "(4)", "(22)", "(15)", "(23)",
    ],
}
def get_leaf_loc(den):
    """
    Return the x positions at which the leaf labels are placed.

    The positions run from the smallest to the largest value found in
    ``den["icoord"]`` (both truncated to int, upper end inclusive) in
    steps of 10.
    """
    x_coords = np.array(den["icoord"])
    first = int(x_coords.min())
    # +1 so that the largest coordinate itself is still part of the range
    stop = int(x_coords.max() + 1)
    return range(first, stop, 10)
def get_df_coord(den):
    """
    Build the coordinate dataframe, one row per link of the dendrogram.

    Every link is an upside-down "U" whose four corners are numbered
    1 to 4. Columns ``xk1``..``xk4`` hold the corner x values taken from
    ``den["icoord"]`` and ``yk1``..``yk4`` the corner y values taken from
    ``den["dcoord"]``.
    """
    corner_ids = range(1, 5)
    x_frame = pd.DataFrame(den["icoord"], columns=[f"xk{k}" for k in corner_ids])
    y_frame = pd.DataFrame(den["dcoord"], columns=[f"yk{k}" for k in corner_ids])
    # rows are matched on their positional index, i.e. link by link
    return x_frame.merge(y_frame, left_index=True, right_index=True)
# One row per link, with the four corner coordinates as columns.
source = get_df_coord(den)
base = alt.Chart(source)

# Every link (an upside-down "U") is drawn with three rules:
# a horizontal shoulder and two vertical arms.
shoulder = base.mark_rule().encode(
    x=alt.X("xk2:Q", title=""),
    x2=alt.X2("xk3:Q"),
    y=alt.Y("yk2:Q", title=""),
)
arm1 = base.mark_rule().encode(
    x=alt.X("xk1:Q"),
    y=alt.Y("yk1:Q"),
    y2=alt.Y2("yk2:Q"),
)
arm2 = base.mark_rule().encode(
    x=alt.X("xk3:Q"),
    y=alt.Y("yk3:Q"),
    y2=alt.Y2("yk4:Q"),
)
chart_den = shoulder + arm1 + arm2

# Leaf labels, one per leaf position, shown in a strip below the tree.
df_text = pd.DataFrame({"labels": den["ivl"], "x": get_leaf_loc(den)})
chart_text = (
    alt.Chart(df_text)
    .mark_text(dy=0, angle=0, align="center")
    .encode(
        x=alt.X("x:Q", axis={"grid": False, "title": "Number of points in nodes"}),
        text=alt.Text("labels:N"),
    )
)

# Stack the tree above the labels; both panels share one x scale so the
# labels line up with the leaves.
(
    (chart_den & chart_text)
    .resolve_scale(x="shared")
    .configure(padding={"top": 10, "left": 10})
    .configure_concat(spacing=0)
    .configure_axis(labels=False, ticks=False, grid=False)
    .properties(title="Hierarchical Clustering Dendrogram")
)
import pandas as pd
import altair as alt
import numpy as np
# the variable `den` shown below is an example output of `scipy.cluster.hierarchy.dendrogram`
# (https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html#scipy.cluster.hierarchy.dendrogram)
# where the dendrogram itself is truncated such that no more than 3 levels of the dendrogram tree are shown.
# Hard-coded dendrogram description with three parallel entries:
#   "dcoord" - four y values per link (the corners of one upside-down "U"),
#   "icoord" - four x values per link, in the same row order as "dcoord",
#   "ivl"    - one text label per leaf.
den = {
    "dcoord": [
        [0.0, 0.8187388676087964, 0.8187388676087964, 0.0],
        [0.0, 1.105139508538779, 1.105139508538779, 0.0],
        [0.8187388676087964, 1.3712698320830048, 1.3712698320830048, 1.105139508538779],
        [0.0, 0.9099819926189507, 0.9099819926189507, 0.0],
        [0.0, 1.2539936203984452, 1.2539936203984452, 0.0],
        [0.9099819926189507, 1.9187528699821954, 1.9187528699821954, 1.2539936203984452],
        [1.3712698320830048, 3.828052620290243, 3.828052620290243, 1.9187528699821954],
        [0.0, 1.7604450194955439, 1.7604450194955439, 0.0],
        [0.0, 1.845844754344974, 1.845844754344974, 0.0],
        [1.7604450194955439, 4.847708507921838, 4.847708507921838, 1.845844754344974],
        [0.0, 2.8139388316471536, 2.8139388316471536, 0.0],
        [0.0, 2.8694176394568705, 2.8694176394568705, 0.0],
        [2.8139388316471536, 6.399406819518539, 6.399406819518539, 2.8694176394568705],
        [4.847708507921838, 12.300396052792589, 12.300396052792589, 6.399406819518539],
        [3.828052620290243, 32.44760699959244, 32.44760699959244, 12.300396052792589],
    ],
    "icoord": [
        [5.0, 5.0, 15.0, 15.0],
        [25.0, 25.0, 35.0, 35.0],
        [10.0, 10.0, 30.0, 30.0],
        [45.0, 45.0, 55.0, 55.0],
        [65.0, 65.0, 75.0, 75.0],
        [50.0, 50.0, 70.0, 70.0],
        [20.0, 20.0, 60.0, 60.0],
        [85.0, 85.0, 95.0, 95.0],
        [105.0, 105.0, 115.0, 115.0],
        [90.0, 90.0, 110.0, 110.0],
        [125.0, 125.0, 135.0, 135.0],
        [145.0, 145.0, 155.0, 155.0],
        [130.0, 130.0, 150.0, 150.0],
        [100.0, 100.0, 140.0, 140.0],
        [40.0, 40.0, 120.0, 120.0],
    ],
    "ivl": [
        "(7)", "(8)", "41", "(5)", "(10)", "(7)", "(4)", "(8)",
        "(9)", "(15)", "(5)", "(7)", "(4)", "(22)", "(15)", "(23)",
    ],
}
def get_leaf_loc(den):
    """
    Compute the x locations used for the leaf labels.

    Returns a ``range`` that starts at the minimum of ``den["icoord"]``,
    includes its maximum, and advances by 10 (bounds truncated to int).
    """
    icoord = np.array(den["icoord"])
    lowest, highest = icoord.min(), icoord.max()
    # the stop bound is exclusive, hence the +1 to keep the maximum
    return range(int(lowest), int(highest + 1), 10)
def get_df_coord(den):
    """
    Assemble the per-link corner coordinates into a single dataframe.

    Viewing each link as an upside-down "U" with corners 1 to 4, the
    result has the x values from ``den["icoord"]`` in columns
    ``xk1``..``xk4`` and the y values from ``den["dcoord"]`` in columns
    ``yk1``..``yk4``; row i describes link i.
    """
    x_names = ["xk1", "xk2", "xk3", "xk4"]
    y_names = ["yk1", "yk2", "yk3", "yk4"]
    x_part = pd.DataFrame(den["icoord"], columns=x_names)
    y_part = pd.DataFrame(den["dcoord"], columns=y_names)
    # index-on-index merge: pairs up the x and y rows of the same link
    combined = x_part.merge(y_part, left_index=True, right_index=True)
    return combined
# One row per link, with the four corner coordinates as columns.
source = get_df_coord(den)
base = alt.Chart(source)

# A link is an upside-down "U": one horizontal shoulder joining corners
# 2 and 3, plus a vertical arm on each side.
shoulder = base.mark_rule().encode(
    x=alt.X("xk2:Q", title=""),
    x2=alt.X2("xk3:Q"),
    y=alt.Y("yk2:Q", title=""),
)
arm1 = base.mark_rule().encode(
    x=alt.X("xk1:Q"),
    y=alt.Y("yk1:Q"),
    y2=alt.Y2("yk2:Q"),
)
arm2 = base.mark_rule().encode(
    x=alt.X("xk3:Q"),
    y=alt.Y("yk3:Q"),
    y2=alt.Y2("yk4:Q"),
)
chart_den = shoulder + arm1 + arm2

# Text labels for the leaves, positioned at the leaf x locations.
df_text = pd.DataFrame({"labels": den["ivl"], "x": get_leaf_loc(den)})
chart_text = (
    alt.Chart(df_text)
    .mark_text(dy=0, angle=0, align="center")
    .encode(
        x=alt.X("x:Q", axis={"grid": False, "title": "Number of points in nodes"}),
        text=alt.Text("labels:N"),
    )
)

# Tree on top, labels underneath, sharing the x scale so they line up.
(
    alt.vconcat(chart_den, chart_text)
    .resolve_scale(x="shared")
    .configure(padding={"top": 10, "left": 10})
    .configure_concat(spacing=0)
    .configure_axis(labels=False, ticks=False, grid=False)
    .properties(title="Hierarchical Clustering Dendrogram")
)
# No channel encoding options are specified in this chart,
# so the code is the same as for the method-based syntax.