Andrews Curves

Andrews curves visualize clusters in multivariate data by mapping each observation to a finite Fourier series and drawing it as a single line. Observations that belong to the same group tend to trace out similar shapes.

A curve has the functional form:

f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) + x_4 sin(2t) + ...

where the x coefficients are the (normalized) values of each dimension and t is spaced linearly between -pi and +pi.

This example shows an Andrews curves chart of the Penguins dataset, where each species traces a recognizable band of curves.

import numpy as np
import pandas as pd

import altair as alt
from altair.datasets import data

# Column holding the class label; it drives the colour encoding.
col_class = "Species"
# Numeric columns that become the Fourier coefficients x_1, x_2, ...
cols = [
    "Beak Length (mm)",
    "Beak Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)",
]
# Number of t positions at which every curve is evaluated.
samples = 100

# Discard rows that lack any of the measurements or the class label.
source = data.penguins().dropna(subset=cols + [col_class])
# Keep only the first 15 rows of each species: the chart stays light and
# every group is still represented.
by_class = source.groupby(col_class, observed=True)
frame = by_class.head(15).reset_index(drop=True)

t = np.linspace(-np.pi, np.pi, samples)

# Min-max scale every column to [0, 1] so no single dimension dominates,
# then transpose so that values[k] holds coefficient k for all observations.
raw = frame[cols].to_numpy(dtype=float)
lowest = raw.min(axis=0)
highest = raw.max(axis=0)
values = ((raw - lowest) / (highest - lowest)).T

# Constant term x_1 / sqrt(2); result has one row per observation and one
# column per t position.
curves = np.outer(values[0], np.ones_like(t)) / np.sqrt(2)
# Remaining terms alternate sin, cos, sin, ... with frequencies 1, 1, 2, 2, ...
for k, coefficient in enumerate(values[1:], start=1):
    frequency = (k + 1) // 2
    if k % 2:
        basis = np.sin(frequency * t)
    else:
        basis = np.cos(frequency * t)
    curves += np.outer(coefficient, basis)

# Flatten to long form: one record per (observation, t) pair. `curves` is
# flattened row by row, so t is tiled while the per-observation columns are
# repeated `samples` times each.
rows = frame.shape[0]
labels = frame[col_class].to_numpy()
plot_data = pd.DataFrame(
    {
        "t": np.tile(t, rows),
        "value": curves.reshape(-1),
        "sample": np.arange(rows).repeat(samples),
        col_class: labels.repeat(samples),
    }
)

# One line per observation (via `detail`), coloured by class.
lines = alt.Chart(plot_data).mark_line()
lines.encode(
    x="t:Q",
    y=alt.Y("value:Q").title(None),
    color=f"{col_class}:N",
    detail="sample:N",
    opacity=alt.value(0.5),
)
import numpy as np
import pandas as pd

import altair as alt
from altair.datasets import data

# Column holding the class label; it drives the colour encoding.
col_class = "Species"
# Numeric columns that become the Fourier coefficients x_1, x_2, ...
cols = [
    "Beak Length (mm)",
    "Beak Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)",
]
# Number of t positions at which every curve is evaluated.
samples = 100

# Discard rows that lack any of the measurements or the class label.
source = data.penguins().dropna(subset=cols + [col_class])
# Keep only the first 15 rows of each species: the chart stays light and
# every group is still represented.
by_class = source.groupby(col_class, observed=True)
frame = by_class.head(15).reset_index(drop=True)

t = np.linspace(-np.pi, np.pi, samples)

# Min-max scale every column to [0, 1] so no single dimension dominates,
# then transpose so that values[k] holds coefficient k for all observations.
raw = frame[cols].to_numpy(dtype=float)
lowest = raw.min(axis=0)
highest = raw.max(axis=0)
values = ((raw - lowest) / (highest - lowest)).T

# Constant term x_1 / sqrt(2); result has one row per observation and one
# column per t position.
curves = np.outer(values[0], np.ones_like(t)) / np.sqrt(2)
# Remaining terms alternate sin, cos, sin, ... with frequencies 1, 1, 2, 2, ...
for k, coefficient in enumerate(values[1:], start=1):
    frequency = (k + 1) // 2
    if k % 2:
        basis = np.sin(frequency * t)
    else:
        basis = np.cos(frequency * t)
    curves += np.outer(coefficient, basis)

# Flatten to long form: one record per (observation, t) pair. `curves` is
# flattened row by row, so t is tiled while the per-observation columns are
# repeated `samples` times each.
rows = frame.shape[0]
labels = frame[col_class].to_numpy()
plot_data = pd.DataFrame(
    {
        "t": np.tile(t, rows),
        "value": curves.reshape(-1),
        "sample": np.arange(rows).repeat(samples),
        col_class: labels.repeat(samples),
    }
)

# One line per observation (via `detail`), coloured by class.
lines = alt.Chart(plot_data).mark_line()
lines.encode(
    x="t:Q",
    y=alt.Y("value:Q", title=None),
    color=f"{col_class}:N",
    detail="sample:N",
    opacity=alt.value(0.5),
)