Source code for er_evaluation.error_analysis._cluster_error

import numpy as np
import pandas as pd
from scipy.special import comb

from er_evaluation.data_structures import MembershipVector
from er_evaluation.error_analysis._record_error import (
    error_indicator_from_table, error_metrics_from_table,
    expected_relative_missing_from_table, expected_size_difference_from_table,
    record_error_table)
from er_evaluation.utils import relevant_prediction_subset


def error_metrics(prediction, sample):
    """
    Compute the canonical set of error metrics from the record error table.

    Error metrics included:

    * Expected extra links (see :meth:`er_evaluation.error_analysis.expected_extra`)
    * Expected relative extra links (see :meth:`er_evaluation.error_analysis.expected_relative_extra`)
    * Expected missing elements (see :meth:`er_evaluation.error_analysis.expected_missing`)
    * Expected relative missing elements (see :meth:`er_evaluation.error_analysis.expected_relative_missing`)
    * Error indicator (see :meth:`er_evaluation.error_analysis.error_indicator`)

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        DataFrame: Dataframe indexed by cluster identifiers and with values corresponding to error metrics.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,6,7, 8], data=["c1", "c1", "c1", "c2", "c2", "c3", "c3", "c3"])
        >>> error_metrics(prediction, sample)  # doctest: +SKIP
                   expected_extra  expected_relative_extra  expected_missing  expected_relative_missing  error_indicator
        reference
        c1               0.333333                 0.166667          1.333333                   0.444444                1
        c2               0.500000                 0.250000          1.000000                   0.500000                1
        c3               1.000000                 0.333333          0.000000                   0.000000                0

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)

    # Only sampled records that also appear in the prediction can be evaluated.
    in_prediction = reference.index.isin(predicted.index)
    reference = reference[in_prediction]

    return error_metrics_from_table(record_error_table(predicted, reference))
def count_extra(prediction, sample):
    r"""
    Count the number of extraneous elements to sampled clusters.

    Given a predicted disambiguation ``prediction`` and a sample of true clusters ``sample``, both represented
    as membership vectors, this function returns the count of extraneous elements for each true cluster. This
    is a pandas Series indexed by true cluster identifier and with values corresponding to the counts of
    extraneous elements.

    Count of extraneous elements
        For a given sampled cluster :math:`c` with records :math:`r \in c`, let :math:`A_r` be the set of
        records which are erroneously linked to :math:`r` in the predicted clustering. That is, if
        :math:`\hat c(r)` is the predicted cluster containing :math:`r`, then
        :math:`A_r = \hat c(r) \backslash c`. Then the count of extraneous elements for :math:`c` is

        .. math::

            E_{\text{count_extra}}(c) = \sum_{r\in c} \lvert A_r \rvert.

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the count of extraneous elements.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> count_extra(prediction, sample)
        reference
        c1    1
        c2    1
        c4    2
        Name: count_extra, dtype: int64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    prediction = MembershipVector(prediction, dropna=True)
    sample = MembershipVector(sample, dropna=True)
    sample = sample[sample.index.isin(prediction.index)]

    relevant_predictions = relevant_prediction_subset(prediction, sample)
    outer = pd.concat(
        {"prediction": relevant_predictions, "reference": sample},
        axis=1,
        copy=False,
        join="outer",
    )

    # Sizes of the predicted clusters do not depend on the sampled cluster being
    # processed, so they are computed once instead of once per group.
    predicted_sizes = outer["prediction"].value_counts()

    def _extra_count(sample_cluster):
        # Number of elements within sampled cluster split across predicted clusters:
        p = sample_cluster.value_counts()
        # Number of elements within predicted clusters (restricted to current sampled cluster):
        u = predicted_sizes[p.index].to_numpy()

        extra = np.sum(p * (u - p))
        n_links = extra + np.sum(comb(u, 2))
        if n_links == 0:
            return 0
        return extra

    result = outer.groupby("reference").agg(_extra_count)["prediction"]
    return result.rename("count_extra")
def expected_size_difference(prediction, sample):
    r"""
    Expected size difference between predicted and sampled clusters.

    Expected Size Difference:
        For a given sampled cluster :math:`c` with records :math:`r \in c`, let :math:`\hat c(r)` be the
        predicted cluster containing :math:`r`. Then the expected size difference for :math:`c` is

        .. math::

            E_{\text{size}}(c) = \frac{1}{\lvert c \rvert}\sum_{r\in c} \lvert \hat c(r) \rvert - \lvert c \rvert.

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the expected size difference.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,6,7], data=["c1", "c1", "c1", "c2", "c2", "c3", "c3"])
        >>> expected_size_difference(prediction, sample)
        reference
        c1   -1.0
        c2   -0.5
        c3    1.0
        Name: expected_size_diff, dtype: float64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)

    # Only sampled records that also appear in the prediction can be evaluated.
    in_prediction = reference.index.isin(predicted.index)
    reference = reference[in_prediction]

    return expected_size_difference_from_table(record_error_table(predicted, reference))
def expected_extra(prediction, sample):
    r"""
    Expected number of extraneous elements to records in sampled clusters.

    Given a predicted disambiguation ``prediction`` and a sample of true clusters ``sample``, both represented
    as membership vectors, this function returns the expected number of extraneous elements for each true
    cluster. This is a pandas Series indexed by true cluster identifier and with values corresponding to the
    expected number of extraneous elements.

    Expected Number of extraneous elements
        For a given sampled cluster :math:`c` with records :math:`r \in c`, let :math:`A_r` be the set of
        records which are erroneously linked to :math:`r` in the predicted clustering. That is, if
        :math:`\hat c(r)` is the predicted cluster containing :math:`r`, then
        :math:`A_r = \hat c(r) \backslash c`. Then the expected number of extraneous elements for :math:`c` is

        .. math::

            E_{\text{extra}}(c) = \frac{1}{\lvert c \rvert}\sum_{r\in c} \lvert A_r \rvert.

        This is the expected number of erroneous links to a random record :math:`r \in c`.

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the expected number of extraneous elements.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> expected_extra(prediction, sample)
        reference
        c1    0.333333
        c2    0.500000
        c4    2.000000
        Name: expected_extra, dtype: float64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    reference = reference[reference.index.isin(predicted.index)]

    # Dividing the per-cluster count by the cluster size gives the per-record average.
    extra_counts = count_extra(predicted, reference)
    cluster_sizes = reference.groupby(reference).size()
    per_record = extra_counts / cluster_sizes

    return per_record.rename("expected_extra")
def expected_relative_extra(prediction, sample):
    r"""
    Expected relative number of extraneous elements to records in sampled clusters.

    Given a predicted disambiguation ``prediction`` and a sample of true clusters ``sample``, both represented
    as membership vectors, this function returns the expected relative number of extraneous elements for each
    true cluster. This is a pandas Series indexed by true cluster identifier and with values corresponding to
    the expected relative number of extraneous elements.

    Expected Relative Number of extraneous elements
        For a given sampled cluster :math:`c` with records :math:`r \in c`, let :math:`A_r` be the set of
        records which are erroneously linked to :math:`r` in the predicted clustering. That is, if
        :math:`\hat c(r)` is the predicted cluster containing :math:`r`, then
        :math:`A_r = \hat c(r) \backslash c`. Then the expected relative number of extraneous elements for
        :math:`c` is

        .. math::

            E_{\text{rel_extra}}(c) = \frac{1}{\lvert c \rvert}\sum_{r\in c} \lvert A_r \rvert / \lvert \hat c(r) \rvert.

        This is the expected relative number of erroneous links to a random record :math:`r \in c`.

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the expected relative number of extraneous elements.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> expected_relative_extra(prediction, sample)
        reference
        c1    0.166667
        c2    0.250000
        c4    0.666667
        Name: expected_relative_extra, dtype: float64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    prediction = MembershipVector(prediction, dropna=True)
    sample = MembershipVector(sample, dropna=True)
    sample = sample[sample.index.isin(prediction.index)]

    relevant_predictions = relevant_prediction_subset(prediction, sample)
    outer = pd.concat(
        {"prediction": relevant_predictions, "reference": sample},
        axis=1,
        copy=False,
        join="outer",
    )

    # Sizes of the predicted clusters do not depend on the sampled cluster being
    # processed, so they are computed once instead of once per group.
    predicted_sizes = outer["prediction"].value_counts()

    def _relative_extra(sample_cluster):
        # Number of elements within sampled cluster split across predicted clusters:
        p = sample_cluster.value_counts()
        # Number of elements within predicted clusters (restricted to current sampled cluster):
        u = predicted_sizes[p.index].to_numpy()

        n_links = np.sum(p * (u - p)) + np.sum(comb(u, 2))
        if n_links == 0:
            return 0
        return np.sum(p * (u - p) / u)

    # The aggregation is run exactly once; its result is then averaged over each
    # sampled cluster's size.
    result = outer.groupby("reference").agg(_relative_extra)["prediction"]
    sizes = sample.groupby(sample).size()
    result = result / sizes

    return result.rename("expected_relative_extra")
def count_missing(prediction, sample):
    r"""
    Count the number of missing elements to sampled clusters.

    Given a predicted disambiguation ``prediction`` and a sample of true clusters ``sample``, both represented
    as membership vectors, this function returns the count of missing elements for each true cluster. This is
    a pandas Series indexed by true cluster identifier and with values corresponding to the counts of missing
    elements.

    Count of missing elements
        For a given sampled cluster :math:`c` with records :math:`r \in c`, let :math:`B_r` be the set of
        records which are missing from the predicted cluster containing :math:`r`. That is, if
        :math:`\hat c(r)` is the predicted cluster containing :math:`r`, then
        :math:`B_r = c \backslash \hat c(r)`. Then the count of missing elements for :math:`c` is

        .. math::

            E_{\text{count_miss}}(c) = \sum_{r\in c} \lvert B_r \rvert.

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the count of missing elements.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> count_missing(prediction, sample)
        reference
        c1    4
        c2    2
        c4    0
        Name: count_missing, dtype: int64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    prediction = MembershipVector(prediction, dropna=True)
    sample = MembershipVector(sample, dropna=True)
    sample = sample[sample.index.isin(prediction.index)]

    relevant_predictions = relevant_prediction_subset(prediction, sample)
    outer = pd.concat(
        {"prediction": relevant_predictions, "reference": sample},
        axis=1,
        copy=False,
        join="outer",
    )

    def _missing_count(sample_cluster):
        # Number of elements within sampled cluster split across predicted clusters:
        p = sample_cluster.value_counts()
        # Size of the sampled cluster:
        n = np.sum(p)

        # A cluster with fewer than two records has no pair of records to link,
        # hence nothing can be missing.
        if n < 2:
            return 0
        # Each record in a part of size p_i is missing the other n - p_i records.
        return np.sum(p * (n - p))

    result = outer.groupby("reference").agg(_missing_count)["prediction"]
    return result.rename("count_missing")
def expected_missing(prediction, sample):
    r"""
    Expected number of missing elements to records in sampled clusters.

    Given a predicted disambiguation ``prediction`` and a sample of true clusters ``sample``, both represented
    as membership vectors, this function returns the expected number of missing elements for each true
    cluster. This is a pandas Series indexed by true cluster identifier and with values corresponding to the
    expected number of missing elements.

    Expected Number of missing elements
        For a given sampled cluster :math:`c` with records :math:`r \in c`, let :math:`B_r` be the set of
        records which are missing from the predicted cluster containing :math:`r`. That is, if
        :math:`\hat c(r)` is the predicted cluster containing :math:`r`, then
        :math:`B_r = c \backslash \hat c(r)`. Then the expected number of missing elements for :math:`c` is

        .. math::

            E_{\text{miss}}(c) = \frac{1}{\lvert c \rvert}\sum_{r\in c} \lvert B_r \rvert.

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the expected number of missing elements.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> expected_missing(prediction, sample)
        reference
        c1    1.333333
        c2    1.000000
        c4    0.000000
        Name: expected_missing, dtype: float64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    reference = reference[reference.index.isin(predicted.index)]

    # Dividing the per-cluster count by the cluster size gives the per-record average.
    missing_counts = count_missing(predicted, reference)
    cluster_sizes = reference.groupby(reference).size()
    per_record = missing_counts / cluster_sizes

    return per_record.rename("expected_missing")
def expected_relative_missing(prediction, sample):
    r"""
    Expected relative number of missing elements to records in sampled clusters.

    Given a predicted disambiguation ``prediction`` and a sample of true clusters ``sample``, both represented
    as membership vectors, this function returns the expected relative number of missing elements for each
    true cluster. This is a pandas Series indexed by true cluster identifier and with values corresponding to
    the expected relative number of missing elements.

    Expected Relative Number of missing elements
        For a given sampled cluster :math:`c` with records :math:`r \in c`, let :math:`B_r` be the set of
        records which are missing from the predicted cluster containing :math:`r`. That is, if
        :math:`\hat c(r)` is the predicted cluster containing :math:`r`, then
        :math:`B_r = c \backslash \hat c(r)`. Then the expected relative number of missing elements for
        :math:`c` is

        .. math::

            E_{\text{rel_miss}}(c) = \frac{1}{\lvert c \rvert}\sum_{r\in c} \lvert B_r \rvert / \lvert c \rvert.

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the expected relative number of missing elements.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> expected_relative_missing(prediction, sample)
        reference
        c1    0.444444
        c2    0.500000
        c4    0.000000
        Name: expected_relative_missing, dtype: float64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    prediction = MembershipVector(prediction, dropna=True)
    sample = MembershipVector(sample, dropna=True)
    # Enforce the documented restriction here, as the other metrics in this module
    # do, so that sampled records without a prediction never reach the error table.
    sample = sample[sample.index.isin(prediction.index)]

    error_table = record_error_table(prediction, sample)
    return expected_relative_missing_from_table(error_table)
def error_indicator(prediction, sample):
    r"""
    Error indicator metric.

    Given a predicted disambiguation ``prediction`` and a sample of true clusters ``sample``, both represented
    as membership vectors, this function returns an indicator of whether each true cluster matches a predicted
    cluster. This is a pandas Series indexed by true cluster identifier and with values corresponding to 0 or
    1, depending on whether or not the true cluster matches a predicted cluster. In the example below, the
    value is 0 for a true cluster which coincides with a predicted cluster (``c4``) and 1 for true clusters
    which do not (``c1`` and ``c2``).

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the error indicator.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,5])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> error_indicator(prediction, sample)
        reference
        c1    1
        c2    1
        c4    0
        Name: error_indicator, dtype: int64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    prediction = MembershipVector(prediction, dropna=True)
    sample = MembershipVector(sample, dropna=True)
    # Enforce the documented restriction here, as the other metrics in this module
    # do, so that sampled records without a prediction never reach the error table.
    sample = sample[sample.index.isin(prediction.index)]

    error_table = record_error_table(prediction, sample)
    return error_indicator_from_table(error_table)
def splitting_entropy(prediction, sample, alpha=1):
    r"""
    Splitting entropy of true clusters.

    This function returns the splitting entropy, defined below, of each entity represented in the sampled
    clusters `sample`.

    Splitting Entropy:
        Let :math:`\hat{\mathcal{C}}` be a clustering of records :math:`\mathcal{R}` into **predicted**
        entities. For a given entity represented by a cluster :math:`c`, the splitting entropy is defined as
        the exponentiated Shannon entropy of the set of cluster sizes
        :math:`\{\lvert \hat c \cap c \rvert \mid \hat c \in \widehat{\mathcal{C}},\, \lvert \hat c \cap c \rvert > 0 \}`.
        That is, with using the convention that :math:`0 \cdot \log (0) = 0`, we have

        .. math::

            E_{\text{split}}(c) = \exp\left \{-\sum_{\hat c \in \widehat{\mathcal{C}}} \frac{\lvert\hat c \cap c \rvert}{\sum_{\hat c' \in \widehat{\mathcal{C}}} \lvert \hat c' \cap c \rvert } \log \left(\frac{\lvert\hat c \cap c \rvert}{\sum_{\hat c' \in \widehat{\mathcal{C}}} \lvert \hat c' \cap c \rvert }\right) \right \}.

    Args:
        prediction (Series): Membership vector representing a predicted disambiguation.
        sample (Series): Membership vector representing a set of true clusters.
        alpha (float): Order of the entropy. With ``alpha=1`` (default), the exponentiated Shannon entropy
            defined above is returned. With ``alpha=0``, the number of predicted clusters across which the
            true cluster is split is returned. For any other value, with :math:`u_i` the proportions of the
            true cluster falling in each predicted cluster, the value
            :math:`(\sum_i u_i^\alpha)^{1/(1-\alpha)}` is returned.

    Returns:
        Series: Pandas Series indexed by true cluster identifiers (unique values in `sample`) and with values
        corresponding to the splitting entropy. The Series is named ``splitting_entropy_{alpha}``.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> splitting_entropy(prediction, sample)
        reference
        c1    1.889882
        c2    2.000000
        c4    1.000000
        Name: splitting_entropy_1, dtype: float64

    Notes:
        The sample is restricted to the set of records which are present in the prediction.
    """
    prediction = MembershipVector(prediction, dropna=True)
    sample = MembershipVector(sample, dropna=True)
    sample = sample[sample.index.isin(prediction.index)]

    relevant_predictions = relevant_prediction_subset(prediction, sample)
    outer = pd.concat(
        {"prediction": relevant_predictions, "reference": sample},
        axis=1,
        copy=False,
        join="outer",
    )

    def _entropy(sample_cluster):
        # Proportion of the sampled cluster falling in each predicted cluster.
        shares = sample_cluster.value_counts(normalize=True).to_numpy()

        # A cluster which is not split has entropy 1 for every order alpha.
        if len(shares) <= 1:
            return 1
        if alpha == 0:
            return len(shares)
        # The general formula below is undefined at alpha == 1 (division by zero
        # in the exponent), so the Shannon case is handled explicitly.
        if alpha == 1:
            return np.exp(-np.sum(shares * np.log(shares)))
        return (np.sum(shares**alpha)) ** (1 / (1 - alpha))

    result = outer.groupby("reference").agg(_entropy)["prediction"]
    return result.rename(f"splitting_entropy_{alpha}")