Andrews Curves
Andrews curves visualize clusters in multivariate data by mapping each observation to a finite Fourier series and drawing it as a single line. Observations that belong to the same group tend to trace out similar shapes.
A curve has the functional form:
f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) + x_4 sin(2t) + ...
where the coefficients x_1, x_2, ... are the (normalized) values of each dimension of an observation, and
t is sampled at evenly spaced points between -pi and +pi.
This example shows an Andrews curves chart of the Penguins dataset, where each species traces a recognizable band of curves.
import numpy as np
import pandas as pd
import altair as alt
from altair.datasets import data
# Column holding the class label, and the numeric measurements that become
# the Fourier coefficients of each curve.
col_class = "Species"
cols = [
    "Beak Length (mm)",
    "Beak Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)",
]
# Number of points at which every curve is evaluated.
samples = 100

source = data.penguins().dropna(subset=[*cols, col_class])

# Keep only the first 15 rows of every species: the chart stays light while
# all groups remain represented.
frame = (
    source.groupby(col_class, observed=True)
    .head(15)
    .reset_index(drop=True)
)
rows = len(frame)

t = np.linspace(-np.pi, np.pi, samples)

# Min-max scale every dimension to [0, 1] so that no single column dominates
# the curve, then transpose so that values[i] holds dimension i for all rows.
raw = frame[cols].to_numpy(dtype=float)
lowest = raw.min(axis=0)
highest = raw.max(axis=0)
values = ((raw - lowest) / (highest - lowest)).T

# Constant term x_1 / sqrt(2), broadcast along t: one row per observation.
curves = values[0][:, np.newaxis] * np.ones_like(t) / np.sqrt(2)

# Remaining terms alternate sin, cos, sin, ...; the harmonic goes up by one
# after each sin/cos pair: sin(t), cos(t), sin(2t), cos(2t), ...
for i, coefficients in enumerate(values[1:], start=1):
    harmonic = (i + 1) // 2
    if i % 2:
        wave = np.sin(harmonic * t)
    else:
        wave = np.cos(harmonic * t)
    curves += coefficients[:, np.newaxis] * wave

# Long-form table: one record per (observation, t) pair, observation-major.
plot_data = pd.DataFrame(
    {
        "t": np.tile(t, rows),
        "value": curves.reshape(-1),
        "sample": np.arange(rows).repeat(samples),
        col_class: frame[col_class].to_numpy().repeat(samples),
    }
)

# One line per observation (detail), colored by class.
alt.Chart(plot_data).mark_line().encode(
    x=alt.X("t:Q"),
    y=alt.Y("value:Q").title(None),
    color=alt.Color(f"{col_class}:N"),
    detail=alt.Detail("sample:N"),
    opacity=alt.value(0.5),
)
import numpy as np
import pandas as pd
import altair as alt
from altair.datasets import data
# Column holding the class label, and the numeric measurements that become
# the Fourier coefficients of each curve.
col_class = "Species"
cols = [
    "Beak Length (mm)",
    "Beak Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)",
]
# Number of points at which every curve is evaluated.
samples = 100

source = data.penguins().dropna(subset=[*cols, col_class])

# Keep only the first 15 rows of every species: the chart stays light while
# all groups remain represented.
frame = (
    source.groupby(col_class, observed=True)
    .head(15)
    .reset_index(drop=True)
)
rows = len(frame)

t = np.linspace(-np.pi, np.pi, samples)

# Min-max scale every dimension to [0, 1] so that no single column dominates
# the curve, then transpose so that values[i] holds dimension i for all rows.
raw = frame[cols].to_numpy(dtype=float)
lowest = raw.min(axis=0)
highest = raw.max(axis=0)
values = ((raw - lowest) / (highest - lowest)).T

# Constant term x_1 / sqrt(2), broadcast along t: one row per observation.
curves = values[0][:, np.newaxis] * np.ones_like(t) / np.sqrt(2)

# Remaining terms alternate sin, cos, sin, ...; the harmonic goes up by one
# after each sin/cos pair: sin(t), cos(t), sin(2t), cos(2t), ...
for i, coefficients in enumerate(values[1:], start=1):
    harmonic = (i + 1) // 2
    if i % 2:
        wave = np.sin(harmonic * t)
    else:
        wave = np.cos(harmonic * t)
    curves += coefficients[:, np.newaxis] * wave

# Long-form table: one record per (observation, t) pair, observation-major.
plot_data = pd.DataFrame(
    {
        "t": np.tile(t, rows),
        "value": curves.reshape(-1),
        "sample": np.arange(rows).repeat(samples),
        col_class: frame[col_class].to_numpy().repeat(samples),
    }
)

# One line per observation (detail), colored by class.
alt.Chart(plot_data).mark_line().encode(
    x=alt.X("t:Q"),
    y=alt.Y("value:Q", title=None),
    color=alt.Color(f"{col_class}:N"),
    detail=alt.Detail("sample:N"),
    opacity=alt.value(0.5),
)