# Source code for er_evaluation.plots._fairness

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from er_evaluation.estimators import pairwise_f_estimator
from er_evaluation.utils import expand_grid


def plot_performance_disparities(
    prediction,
    reference,
    weights,
    protected_feature,
    estimator=pairwise_f_estimator,
    estimator_name="Pairwise F-score",
    max_subgroups=10,
):
    """
    Plot largest performance disparities among predefined subgroups.

    Args:
        prediction (Series): Predicted clustering.
        reference (Series): Reference clustering.
        weights (str or Series): Weights for sampled clusters, or one of "uniform" or "cluster_size".
        protected_feature (Series): Series indexed by reference cluster IDs and with values corresponding to group
            assignment. Its ``name`` is used as the axis label; "Protected attribute" is used when the name is None.
            The Series itself is not modified.
        estimator: Function to use for performance estimation. Defaults to pairwise_f_estimator.
        estimator_name (str, optional): Name of the estimator to use in the plot labels. Defaults to "Pairwise F-score".
        max_subgroups (int, optional): Number of subgroups to display. Defaults to 10.

    Returns:
        plotly Figure, or the string "No scores to display." when no score row is left after dropping missing values.
    """
    # Resolve the label into a local instead of assigning to `protected_feature.name`,
    # which would silently rename the caller's Series.
    feature_name = protected_feature.name
    if feature_name is None:
        feature_name = "Protected attribute"

    scores_df = _make_scores_df(
        prediction, reference, weights, feature_name, protected_feature, estimator_name, estimator
    )

    return _make_largest_difference_figure(
        scores_df,
        estimator_name,
        protected_feature=feature_name,
        max_subgroups_per_control_cat_to_display=max_subgroups,
    )


def _make_scores_df(prediction, reference, weights, protected_feature, protected_data, estimator_name, estimator):
    """
    Build a table of per-subgroup scores alongside the overall baseline score.

    One row is created by ``expand_grid`` for each combination of a unique value of ``protected_data``
    and a scorer name (here a single scorer, ``estimator_name``).

    Args:
        prediction (Series): Predicted clustering, passed unchanged to the estimator.
        reference (Series): Reference clustering. For each subgroup it is restricted to the entries whose
            value appears in the index of the ``protected_data`` rows belonging to that subgroup.
        weights: Forwarded to the estimator as the ``weights`` keyword argument.
        protected_feature (str): Name of the column holding the subgroup value in the returned table.
        protected_data (Series): Subgroup assignment; its index is matched against the values of ``reference``.
        estimator_name (str): Label stored in the ``_scorer`` column.
        estimator (callable): Called as ``estimator(prediction, reference, weights=weights)``; must return
            two values, which are stored in the score and std columns respectively.

    Returns:
        DataFrame: The grid with added columns ``_score``, ``_std``, ``_count`` (number of unique reference
        values in the subgroup), ``_baseline``, ``_baseline_std``, ``_baseline_count`` (number of unique
        values in the full reference) and ``_diff`` (``_score - _baseline``).
    """
    estimators = {estimator_name: estimator}
    table = expand_grid(**{protected_feature: protected_data.unique()}, _scorer=estimators)

    # The baseline is computed on the full reference and does not depend on the subgroup,
    # so each scorer is evaluated once rather than once per table row.
    baselines = {}
    for name, scorer in estimators.items():
        baseline, baseline_std = scorer(prediction, reference, weights=weights)
        baselines[name] = (baseline, baseline_std)
    baseline_count = reference.nunique()

    scores, stds, counts = [], [], []
    for subgroup, scorer_key in zip(table[protected_feature], table["_scorer"]):
        # Reference entries whose cluster ID is assigned to this subgroup.
        members = protected_data[protected_data == subgroup].index
        subgroup_reference = reference[reference.isin(members)]

        score, std = estimators[scorer_key](prediction, subgroup_reference, weights=weights)
        scores.append(score)
        stds.append(std)
        counts.append(subgroup_reference.nunique())

    table["_score"] = scores
    table["_std"] = stds
    table["_count"] = counts
    table["_baseline"] = [baselines[name][0] for name in table["_scorer"]]
    table["_baseline_std"] = [baselines[name][1] for name in table["_scorer"]]
    table["_baseline_count"] = baseline_count
    table["_diff"] = table["_score"] - table["_baseline"]

    return table


def _add_differences_traces(
    sub_visual_df, fig, protected_feature, max_subgroups_per_control_cat_to_display, row=1, col=1
):
    """
    Add, for each selected subgroup, a score-to-baseline segment and its error bars to one subplot of ``fig``.

    This function is adapted from Deepcheck's PerformanceBias module (https://deepchecks.com/) released under the AGPL-3 license.

    The rows of ``sub_visual_df`` with the smallest ``_diff`` are kept (at most
    ``max_subgroups_per_control_cat_to_display`` of them) and then drawn in order of decreasing ``_diff``.
    Each row contributes two traces at (``row``, ``col``): a connecting line with hover text, and a
    markers-only trace carrying the ``_std`` / ``_baseline_std`` error bars. ``fig`` is modified in place.
    """
    hover_suffix = "<extra></extra>"  # Hide extra label in hover

    selected = (
        sub_visual_df.sort_values("_diff")
        .head(max_subgroups_per_control_cat_to_display)
        .sort_values("_diff", ascending=False)
    )

    for _, record in selected.iterrows():
        group_label = record[protected_feature]
        x_values = [record["_score"], record["_baseline"]]
        y_values = [group_label, group_label]
        error_sizes = [record["_std"], record["_baseline_std"]]

        # Red for subgroups scoring below the baseline, green otherwise.
        if record["_diff"] < 0:
            accent = "orangered"
            legend_key = "Negative differences"
        else:
            accent = "limegreen"
            legend_key = "Positive differences"

        subgroup_hover = "%{y}: %{x} (group size: " + str(record["_count"]) + ")" + hover_suffix
        baseline_hover = "baseline: %{x} (group size: " + str(record["_baseline_count"]) + ")" + hover_suffix

        # Segment joining the subgroup score (white marker) to the baseline score (dark marker).
        segment = go.Scatter(
            x=x_values,
            y=y_values,
            hovertemplate=[subgroup_hover, baseline_hover],
            marker={
                "color": ["white", "#222222"],
                "symbol": 0,
                "size": 6,
                "line": {"width": [2, 2], "color": [accent, accent]},
            },
            legendgroup=legend_key,
            line={"color": accent, "width": 8},
            opacity=1,
            showlegend=False,
            mode="lines+text+markers",
            cliponaxis=False,
        )
        fig.add_trace(segment, row=row, col=col)

        # Error bars
        uncertainty = go.Scatter(
            x=x_values,
            y=y_values,
            hoverinfo="skip",
            error_x={"array": error_sizes, "color": "black", "thickness": 0.75},
            marker={"color": ["white", "#222222"], "symbol": 0, "size": 6},
            legendgroup=legend_key,
            opacity=1,
            showlegend=False,
            mode="markers",
            cliponaxis=False,
        )
        fig.add_trace(uncertainty, row=row, col=col)


def _add_legend(fig):
    """
    Add legend-only placeholder traces to ``fig`` and return it.

    This function is adapted from Deepcheck's PerformanceBias module (https://deepchecks.com/) released under the AGPL-3 license.

    Four traces with no data points (``x=[None]``, ``y=[None]``) are added: a "subgroup score" and a
    "baseline score" marker for each of the "Negative differences" and "Positive differences" legend groups.
    """
    legend_groups = (("orangered", "Negative differences"), ("limegreen", "Positive differences"))
    marker_kinds = (("white", "subgroup score"), ("#222222", "baseline score"))

    entries = [
        (outline, group_title, fill, entry_name)
        for outline, group_title in legend_groups
        for fill, entry_name in marker_kinds
    ]

    for outline, group_title, fill, entry_name in entries:
        placeholder = go.Scatter(
            x=[None],
            y=[None],
            mode="markers",
            name=entry_name,
            legendgroup=group_title,
            legendgrouptitle={"text": group_title},
            marker={"color": fill, "symbol": 0, "size": 6, "line": {"width": 2, "color": outline}},
        )
        fig.add_traces(placeholder)

    return fig


def _make_largest_difference_figure(
    scores_df: pd.DataFrame,
    scorer_name: str,
    protected_feature,
    control_feature=None,
    max_control_cat_to_display=3,
    max_subgroups_per_control_cat_to_display=3,
):
    """
    Create 'largest performance disparity' figure.

    This function is adapted from Deepcheck's PerformanceBias module (https://deepchecks.com/) released under the AGPL-3 license.

    Parameters
    ----------
    scores_df : DataFrame
        Dataframe of performance scores, as returned by `_make_scores_df()`, disaggregated by
        feature and control_feature, and with average scores for each control_feature level.
        Columns named after `protected_feature` and (optionally) `control_feature` are expected, as
        well as columns named '_scorer', '_score', '_std', '_baseline', '_baseline_std', '_count',
        '_baseline_count' and '_diff'. Rows containing missing values are dropped before plotting.
        If a '_class' column is present, it is used as an additional subplot grouping.
    scorer_name : str
        Name of the scorer, used in the x-axis title.
    protected_feature : str
        Name of the column holding the subgroup labels; also used as the y-axis title.
    control_feature : str, optional
        Name of a column used to split the figure into one subplot per category.
    max_control_cat_to_display : int
        Maximum number of subplot categories to display. Categories are taken in order of
        their smallest observed '_diff'.
    max_subgroups_per_control_cat_to_display : int
        Maximum number of subgroups drawn in each subplot.

    Returns
    -------
    Figure or str
        Figure showing subgroups with the largest performance disparities, or the string
        "No scores to display." when no row is left after dropping missing values.
    """
    visual_df = scores_df.copy().dropna()
    if visual_df.empty:
        return "No scores to display."

    has_control = control_feature is not None
    has_model_classes = "_class" in visual_df.columns

    subplot_grouping = []
    if has_control:
        subplot_grouping.append(control_feature)
    if has_model_classes:
        subplot_grouping.append("_class")

    if subplot_grouping:
        # Get distinct subplot categories with the largest observed differences
        subplots_categories = (
            visual_df.sort_values("_diff", ascending=True)[subplot_grouping]
            .drop_duplicates()
            .head(max_control_cat_to_display)
        )
        rows = len(subplots_categories)

        # Category values are cast to str so that non-string categories (e.g. integers)
        # can be concatenated into the subplot titles.
        title_parts = []
        if has_control:
            title_parts.append(f"{control_feature}=" + subplots_categories[control_feature].astype(str))
        if has_model_classes:
            title_parts.append("model_class=" + subplots_categories["_class"].astype(str))
        joined_titles = title_parts[0]
        for part in title_parts[1:]:
            joined_titles = joined_titles + ", " + part
        subplot_titles = joined_titles.tolist()
    else:
        subplots_categories = None
        rows = 1
        subplot_titles = None

    fig = make_subplots(
        rows=rows,
        cols=1,
        shared_xaxes=True,
        subplot_titles=subplot_titles,
        vertical_spacing=0.7 / rows**1.5,
    )

    if subplots_categories is None:
        _add_differences_traces(
            visual_df, fig, protected_feature, max_subgroups_per_control_cat_to_display, row=1, col=1
        )
    else:
        for row_number, (_, cat) in enumerate(subplots_categories.iterrows(), start=1):
            # Select the rows matching this subplot's category on every grouping column.
            subset = visual_df[subplot_grouping[0]] == cat[subplot_grouping[0]]
            for column in subplot_grouping[1:]:
                subset = subset & (visual_df[column] == cat[column])

            _add_differences_traces(
                visual_df[subset],
                fig,
                protected_feature,
                max_subgroups_per_control_cat_to_display,
                row=row_number,
                col=1,
            )

    title = "Largest performance differences"
    if has_control and has_model_classes:
        title += f" within {control_feature} and model_class categories"
    elif has_control:
        title += f" within {control_feature} categories"
    elif has_model_classes:
        title += " within model_class categories"

    n_subgroups = len(visual_df[protected_feature].unique())
    n_subgroups_shown = min(n_subgroups, max_subgroups_per_control_cat_to_display)
    title += f"<br><sup>(Showing {n_subgroups_shown}/{n_subgroups} {protected_feature} categories"
    if subplot_grouping:
        n_cat = len(visual_df[subplot_grouping].drop_duplicates())
        title += f" per subplot and {rows}/{n_cat} "
        if has_control and has_model_classes:
            title += f"({control_feature}, model_classes)"
        elif has_control:
            title += f"{control_feature}"
        else:
            title += "model_classes"
        title += " categories"
    title += ")</sup>"

    fig.update_layout(title_text=title)
    # Left-align the subplot title annotations.
    fig.update_annotations(x=0, xanchor="left", font_size=12)
    # Only the last x-axis gets a title.
    fig.update_layout({f"xaxis{rows}_title": f"{scorer_name} score"})
    fig.update_layout({f"yaxis{i}_title": protected_feature for i in range(1, rows + 1)})
    fig.update_layout({f"yaxis{i}_tickmode": "linear" for i in range(1, rows + 1)})

    fig.update_layout(height=150 + 50 * rows + 20 * rows * n_subgroups_shown)

    _add_legend(fig)

    return fig