Confidence Interval Ellipses#
This example shows bivariate deviation ellipses of flipper length and body mass of three penguin species.
Inspired by ggplot2.stat_ellipse and directly based on Deviation ellipses example by @essicolo
import numpy as np
import pandas as pd
from scipy.stats import f as F
import altair as alt
from altair.datasets import data
def confidence_region_2d(arr, conf_level=0.95, segments=50):
"""
Calculate confidence interval ellipse.
Parameters
----------
arr
numpy array with 2 columns
conf_level
lower tail probability
segments
number of points describing the ellipse.
"""
n_elements = len(arr)
# Degrees of freedom of the chi-squared distribution in the **numerator**
dfn = 2
# Degrees of freedom of the chi-squared distribution in the **denominator**
dfd = n_elements - 1
# Percent point function at `conf_level` of an F continuous random variable
quantile = F.ppf(conf_level, dfn=dfn, dfd=dfd)
radius = np.sqrt(2 * quantile)
angles = np.arange(0, segments) * 2 * np.pi / segments
circle = np.column_stack((np.cos(angles), np.sin(angles)))
center = np.mean(arr, axis=0)
cov_mat = np.cov(arr, rowvar=False)
return center + radius * (circle @ np.linalg.cholesky(cov_mat).T)
def grouped_confidence_regions(df, col_x, col_y, col_group):
cols = [col_x, col_y]
ellipses = []
ser: pd.Series[float] = df[col_group]
for group in ser.drop_duplicates():
arr = df.loc[ser == group, cols].to_numpy(dtype=np.float64)
ellipse = pd.DataFrame(confidence_region_2d(arr), columns=cols)
ellipse[col_group] = group
ellipses.append(ellipse)
return pd.concat(ellipses).reset_index(names="order")
col_x = "Flipper Length (mm)"
col_y = "Body Mass (g)"
col_group = "Species"
x = alt.X(col_x).scale(zero=False)
y = alt.Y(col_y).scale(zero=False)
color = alt.Color(col_group)
source = data.penguins().dropna(subset=[col_x, col_y, col_group])
ellipse = grouped_confidence_regions(source, col_x=col_x, col_y=col_y, col_group=col_group)
points = alt.Chart(source).mark_circle(size=50, tooltip=True).encode(
x=x,
y=y,
color=color
)
lines = alt.Chart(ellipse).mark_line(filled=True, fillOpacity=0.2).encode(
x=x,
y=y,
color=color,
order="order"
)
chart = (lines + points).properties(height=500, width=500)
chart
import numpy as np
import pandas as pd
from scipy.stats import f as F
import altair as alt
from altair.datasets import data
def confidence_region_2d(arr, conf_level=0.95, segments=50):
"""
Calculate confidence interval ellipse.
Parameters
----------
arr
numpy array with 2 columns
conf_level
lower tail probability
segments
number of points describing the ellipse.
"""
n_elements = len(arr)
# Degrees of freedom of the chi-squared distribution in the **numerator**
dfn = 2
# Degrees of freedom of the chi-squared distribution in the **denominator**
dfd = n_elements - 1
# Percent point function at `conf_level` of an F continuous random variable
quantile = F.ppf(conf_level, dfn=dfn, dfd=dfd)
radius = np.sqrt(2 * quantile)
angles = np.arange(0, segments) * 2 * np.pi / segments
circle = np.column_stack((np.cos(angles), np.sin(angles)))
center = np.mean(arr, axis=0)
cov_mat = np.cov(arr, rowvar=False)
return center + radius * (circle @ np.linalg.cholesky(cov_mat).T)
def grouped_confidence_regions(df, col_x, col_y, col_group):
cols = [col_x, col_y]
ellipses = []
ser: pd.Series[float] = df[col_group]
for group in ser.drop_duplicates():
arr = df.loc[ser == group, cols].to_numpy(dtype=np.float64)
ellipse = pd.DataFrame(confidence_region_2d(arr), columns=cols)
ellipse[col_group] = group
ellipses.append(ellipse)
return pd.concat(ellipses).reset_index(names="order")
col_x = "Flipper Length (mm)"
col_y = "Body Mass (g)"
col_group = "Species"
x = alt.X(col_x, scale=alt.Scale(zero=False))
y = alt.Y(col_y, scale=alt.Scale(zero=False))
color = alt.Color(col_group)
source = data.penguins().dropna(subset=[col_x, col_y, col_group])
ellipse = grouped_confidence_regions(source, col_x=col_x, col_y=col_y, col_group=col_group)
points = alt.Chart(source).mark_circle(size=50, tooltip=True).encode(
x=x,
y=y,
color=color
)
lines = alt.Chart(ellipse).mark_line(filled=True, fillOpacity=0.2).encode(
x=x,
y=y,
color=color,
order="order"
)
chart = (lines + points).properties(height=500, width=500)
chart