Source code for er_evaluation.estimators._estimators

import numpy as np
import pandas as pd
from scipy.special import comb

from er_evaluation.data_structures import MembershipVector
from er_evaluation.error_analysis import record_error_table
from er_evaluation.estimators._utils import (_parse_weights,
                                             ratio_of_means_estimator,
                                             validate_prediction_sample,
                                             validate_weights)
from er_evaluation.estimators.from_table import (
    b_cubed_precision_estimator_from_table,
    b_cubed_recall_estimator_from_table, cluster_f_estimator_from_table,
    cluster_precision_estimator_from_table,
    cluster_recall_estimator_from_table, pairwise_f_estimator_from_table)
from er_evaluation.utils import expand_grid


def _prepare_args(prediction, sample, weights):
    """
    Validate estimator inputs and restrict them to mutually consistent subsets.

    Args:
        prediction (Series): Predicted membership vector. Returned unchanged.
        sample (Series): Sampled reference membership vector. Only the entries
            whose index also appears in ``prediction.index`` are kept.
        weights: Cluster sampling weights, in whatever form ``_parse_weights``
            accepts. After parsing and validation, only the entries whose index
            appears among the values of the filtered ``sample`` are kept.

    Returns:
        tuple: ``(prediction, sample, weights)`` after filtering.
    """
    validate_prediction_sample(prediction, sample)

    # Drop sampled elements that the prediction does not cover.
    covered_by_prediction = sample.index.isin(prediction.index)
    sample = sample[covered_by_prediction]

    # Weights are parsed and validated against the already-filtered sample.
    weights = _parse_weights(sample, weights)
    validate_weights(sample, weights)

    # Keep only the weights of clusters still present in the sample.
    is_sampled_cluster = weights.index.isin(sample.values)
    return prediction, sample, weights[is_sampled_cluster]


@ratio_of_means_estimator
def pairwise_precision_estimator(prediction, sample, weights):
    r"""
    Design estimator for pairwise precision.

    Given a predicted disambiguation `prediction`, a set of ground truth clusters `sample`, and a set of cluster sampling weights `weights` (e.g., inverse probability weights for each cluster), this returns a pairwise precision estimate together with its estimated standard deviation.

    Note:
        This is the precision estimator corresponding to cluster block sampling in [1].

    Args:
        prediction (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        sample (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        weights (Series): Pandas Series indexed by cluster identifier and with values corresponding to cluster sampling weights (e.g., inverse sampling probabilities). Can also be the string "uniform" for uniform sampling weights, or "cluster_size" for inverse cluster size sampling weights.

    Returns:
        tuple: Precision estimate and standard deviation estimate.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> weights = pd.Series(1, index=sample.unique())  # Uniform cluster weights
        >>> pairwise_precision_estimator(prediction, sample, weights)
        (0.3888888888888889, 0.2545875386086578)

    References:
        [1] Binette, Olivier, Sokhna A York, Emma Hickerson, Youngsoo Baek, Sarvo Madhavan, Christina Jones. (2022). Estimating the Performance of Entity Resolution Algorithms: Lessons Learned Through PatentsView.org. arXiv e-prints: arxiv:2210.01230
    """
    # NOTE(review): the body returns weighted per-cluster numerators and
    # denominators; the `ratio_of_means_estimator` decorator is presumably what
    # turns them into the documented (estimate, std) pair -- confirm against
    # its definition.
    prediction = MembershipVector(prediction, dropna=True)
    sample = MembershipVector(sample, dropna=True)
    prediction, sample, weights = _prepare_args(prediction, sample, weights)

    # Elements present in both the prediction and the sample, side by side.
    inner = pd.concat(
        {"prediction": prediction, "reference": sample},
        axis=1,
        join="inner",
        copy=False,
    )
    split_cluster_sizes = inner.groupby(["prediction", "reference"]).size()

    # Number of correctly predicted links (TP) by reference cluster.
    TP_by_reference = (
        split_cluster_sizes.to_frame()
        .assign(cmb=comb(split_cluster_sizes.values, 2))
        .groupby("reference")
        .sum()
        .cmb.sort_index()
        .values
    )

    N = TP_by_reference

    # Restriction of the full prediction to predicted clusters that contain at
    # least one sampled element. This does not depend on the reference cluster
    # being processed, so it is computed once here rather than inside `lambd`.
    K = prediction.isin(inner.prediction)
    prediction_in_touched_clusters = prediction[K]

    def lambd(x):
        # Predicted labels of the elements of the current reference cluster.
        index = inner.prediction.index.isin(x.index)
        group_prediction = inner.prediction[index]

        # All predicted elements that share a predicted cluster with the group.
        J = prediction_in_touched_clusters.isin(group_prediction)

        # A: per predicted cluster, how many of its elements are in the group.
        # B: per predicted cluster, its total count in the restricted prediction.
        # Both are sorted by predicted cluster identifier so they line up.
        A = group_prediction.value_counts(sort=False).sort_index().values
        B = prediction_in_touched_clusters[J].value_counts(sort=False).sort_index().values
        return np.sum(A * (B - A))

    # Number of falsely predicted links (FP) by reference cluster.
    FP_by_reference = inner.groupby("reference").apply(lambd)

    D = TP_by_reference + 0.5 * FP_by_reference
    sorted_weights = weights.sort_index()
    N, D = (N * sorted_weights, D * sorted_weights)

    return N, D
@ratio_of_means_estimator
def pairwise_recall_estimator(prediction, sample, weights):
    r"""
    Design estimator for pairwise recall.

    Given a predicted disambiguation `prediction`, a set of ground truth clusters `sample`, and a set of cluster sampling weights `weights` (e.g., inverse probability weights for each cluster), this returns a pairwise recall estimate together with its estimated standard deviation.

    Note:
        This is the recall estimator corresponding to cluster block sampling in [1].

    Args:
        prediction (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        sample (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        weights (Series): Pandas Series indexed by cluster identifier and with values corresponding to cluster sampling weights (e.g., inverse sampling probabilities). Can also be the string "uniform" for uniform sampling weights, or "cluster_size" for inverse cluster size sampling weights.

    Returns:
        tuple: Recall estimate and standard deviation estimate.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"])
        >>> weights = pd.Series(1, index=sample.unique())  # Uniform cluster weights
        >>> pairwise_recall_estimator(prediction, sample, weights)
        (0.296875, 0.10825317547305482)

    References:
        [1] Binette, Olivier, Sokhna A York, Emma Hickerson, Youngsoo Baek, Sarvo Madhavan, Christina Jones. (2022). Estimating the Performance of Entity Resolution Algorithms: Lessons Learned Through PatentsView.org. arXiv e-prints: arxiv:2210.01230
    """
    # NOTE(review): the body returns weighted per-cluster numerators and
    # denominators; the `ratio_of_means_estimator` decorator is presumably what
    # turns them into the documented (estimate, std) pair -- confirm.
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    predicted, reference, cluster_weights = _prepare_args(predicted, reference, weights)

    # Elements present in both the prediction and the sample, side by side.
    joined = pd.concat(
        {"prediction": predicted, "reference": reference},
        axis=1,
        join="inner",
        copy=False,
    )

    # Size of every (predicted cluster, reference cluster) intersection.
    overlap_sizes = joined.groupby(["prediction", "reference"]).size()

    # Number of correctly predicted links per reference cluster: pairs within
    # each intersection, summed over the intersections of a reference cluster.
    links_per_overlap = pd.Series(comb(overlap_sizes.values, 2), index=overlap_sizes.index)
    true_positives = links_per_overlap.groupby(level="reference").sum().sort_index().values

    # Total number of links per reference cluster, ordered by cluster identifier.
    reference_sizes = joined["reference"].value_counts(sort=False).sort_index().values
    total_links = comb(reference_sizes, 2)

    # Weights are sorted by cluster identifier to line up with the arrays above.
    ordered_weights = cluster_weights.sort_index()
    return true_positives * ordered_weights, total_links * ordered_weights
def pairwise_f_estimator(prediction, sample, weights, beta=1.0):
    """
    Design estimator for pairwise F-score.

    Given a predicted disambiguation `prediction`, a set of ground truth clusters `sample`, and a set of cluster sampling weights `weights` (e.g., inverse probability weights for each cluster), this returns a pairwise F-score estimate together with its estimated standard deviation.

    Args:
        prediction (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        sample (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        weights (Series): Pandas Series indexed by cluster identifier and with values corresponding to cluster sampling weights (e.g., inverse sampling probabilities). Can also be the string "uniform" for uniform sampling weights, or "cluster_size" for inverse cluster size sampling weights.
        beta (float): Weighting parameter for F-score. Default is 1.0.

    Returns:
        tuple: F-score estimate and standard deviation estimate.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5], data=["c1", "c1", "c1", "c2", "c2"])
        >>> weights = pd.Series(1, index=sample.unique())  # Uniform cluster weights
        >>> pairwise_f_estimator(prediction, sample, weights)
        (0.4166666666666667, 0.16666666666666666)
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    predicted, reference, cluster_weights = _prepare_args(predicted, reference, weights)

    # The estimate itself is delegated to the error-table based implementation.
    return pairwise_f_estimator_from_table(
        record_error_table(predicted, reference),
        cluster_weights,
        beta,
    )
def cluster_precision_estimator(prediction, sample, weights):
    """
    Cluster precision design estimator.

    Given a predicted disambiguation `prediction`, a set of ground truth clusters `sample`, and a set of cluster sampling weights `weights` (e.g., inverse probability weights for each cluster), this returns a cluster precision estimate together with its estimated standard deviation.

    Args:
        prediction (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier. This should cover the entire target population for which cluster precision is being computed.
        sample (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        weights (Series): Pandas Series indexed by cluster identifier and with values corresponding to cluster sampling weights (e.g., inverse sampling probabilities). Can also be the string "uniform" for uniform sampling weights, or "cluster_size" for inverse cluster size sampling weights.

    Returns:
        tuple: Cluster precision estimate and standard deviation estimate.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,6,7, 8], data=["c1", "c1", "c1", "c2", "c2", "c3", "c3", "c3"])
        >>> cluster_precision_estimator(prediction, sample, weights="uniform")
        (0.26171875, 0.23593232610221093)

    Notes:
        * This estimator requires ``prediction`` to cover the entire population of interest from which sampled clusters were obtained. Do not subset ``prediction`` in any way.
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    predicted, reference, cluster_weights = _prepare_args(predicted, reference, weights)

    table = record_error_table(predicted, reference)

    # Besides the error table and weights, the table-based estimator is given
    # the number of predicted elements and of distinct predicted clusters.
    n_elements = len(predicted)
    n_predicted_clusters = predicted.nunique()
    return cluster_precision_estimator_from_table(table, cluster_weights, n_elements, n_predicted_clusters)
def cluster_recall_estimator(prediction, sample, weights):
    """
    Cluster recall design estimator.

    Given a predicted disambiguation `prediction`, a set of ground truth clusters `sample`, and a set of cluster sampling weights `weights` (e.g., inverse probability weights for each cluster), this returns a cluster recall estimate together with its estimated standard deviation.

    Args:
        prediction (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        sample (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        weights (Series): Pandas Series indexed by cluster identifier and with values corresponding to cluster sampling weights (e.g., inverse sampling probabilities). Can also be the string "uniform" for uniform sampling weights, or "cluster_size" for inverse cluster size sampling weights.

    Returns:
        tuple: Cluster recall estimate and standard deviation estimate.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,6,7, 8], data=["c1", "c1", "c1", "c2", "c2", "c3", "c3", "c3"])
        >>> cluster_recall_estimator(prediction, sample, weights="uniform")
        (0.3333333333333333, 0.3333333333333333)
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    predicted, reference, cluster_weights = _prepare_args(predicted, reference, weights)

    # The estimate itself is delegated to the error-table based implementation.
    return cluster_recall_estimator_from_table(
        record_error_table(predicted, reference),
        cluster_weights,
    )
def cluster_f_estimator(prediction, sample, weights, beta=1.0):
    """
    Cluster F-score design estimator.

    Given a predicted disambiguation `prediction`, a set of ground truth clusters `sample`, and a set of cluster sampling weights `weights` (e.g., inverse probability weights for each cluster), this returns a cluster F-score estimate together with its estimated standard deviation.

    Args:
        prediction (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier. This should cover the entire target population for which cluster f-score is being computed.
        sample (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        weights (Series): Pandas Series indexed by cluster identifier and with values corresponding to cluster sampling weights (e.g., inverse sampling probabilities). Can also be the string "uniform" for uniform sampling weights, or "cluster_size" for inverse cluster size sampling weights.
        beta (float): F-score weight.

    Returns:
        tuple: Cluster F-score estimate and standard deviation estimate.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5,6,7, 8], data=["c1", "c1", "c1", "c2", "c2", "c3", "c3", "c3"])
        >>> cluster_f_estimator(prediction, sample, weights="uniform")
        (0.29446064139941686, 0.2760765154789527)

    Notes:
        * This estimator requires ``prediction`` to cover the entire population of interest from which sampled clusters were obtained. Do not subset ``prediction`` in any way.
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    predicted, reference, cluster_weights = _prepare_args(predicted, reference, weights)

    table = record_error_table(predicted, reference)

    # Besides the error table and weights, the table-based estimator is given
    # the number of predicted elements and of distinct predicted clusters.
    n_elements = len(predicted)
    n_predicted_clusters = predicted.nunique()
    return cluster_f_estimator_from_table(table, cluster_weights, n_elements, n_predicted_clusters, beta)
def b_cubed_precision_estimator(prediction, sample, weights):
    """
    B-cubed precision design estimator.

    Given a predicted disambiguation `prediction`, a set of ground truth clusters `sample`, and a set of cluster sampling weights `weights` (e.g., inverse probability weights for each cluster), this returns a B-cubed precision estimate together with its estimated standard deviation.

    Args:
        prediction (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        sample (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        weights (Series): Pandas Series indexed by cluster identifier and with values corresponding to cluster sampling weights (e.g., inverse sampling probabilities). Can also be the string "uniform" for uniform sampling weights, or "cluster_size" for inverse cluster size sampling weights.

    Returns:
        tuple: B-cubed precision estimate and standard deviation estimate.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5], data=["c1", "c1", "c1", "c2", "c2"])
        >>> weights = pd.Series(1, index=sample.unique())  # Uniform cluster weights
        >>> b_cubed_precision_estimator(prediction, sample, weights)
        (0.7916666666666667, 0.0416666666666673)
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    predicted, reference, cluster_weights = _prepare_args(predicted, reference, weights)

    # The estimate itself is delegated to the error-table based implementation.
    return b_cubed_precision_estimator_from_table(
        record_error_table(predicted, reference),
        cluster_weights,
    )
def b_cubed_recall_estimator(prediction, sample, weights):
    """
    B-cubed recall design estimator.

    Given a predicted disambiguation `prediction`, a set of ground truth clusters `sample`, and a set of cluster sampling weights `weights` (e.g., inverse probability weights for each cluster), this returns a B-cubed recall estimate together with its estimated standard deviation.

    Args:
        prediction (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        sample (Series): Membership vector indexed by cluster elements and with values corresponding to associated cluster identifier.
        weights (Series): Pandas Series indexed by cluster identifier and with values corresponding to cluster sampling weights (e.g., inverse sampling probabilities). Can also be the string "uniform" for uniform sampling weights, or "cluster_size" for inverse cluster size sampling weights.

    Returns:
        tuple: B-cubed recall estimate and standard deviation estimate.

    Examples:
        >>> prediction = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> sample = pd.Series(index=[1,2,3,4,5], data=["c1", "c1", "c1", "c2", "c2"])
        >>> weights = pd.Series(1, index=sample.unique())  # Uniform cluster weights
        >>> b_cubed_recall_estimator(prediction, sample, weights)
        (0.5277777777777778, 0.027777777777778203)
    """
    predicted = MembershipVector(prediction, dropna=True)
    reference = MembershipVector(sample, dropna=True)
    predicted, reference, cluster_weights = _prepare_args(predicted, reference, weights)

    # The estimate itself is delegated to the error-table based implementation.
    return b_cubed_recall_estimator_from_table(
        record_error_table(predicted, reference),
        cluster_weights,
    )
# Estimator name -> estimator function; used as the default ``estimators``
# argument of ``estimates_table``.
DEFAULT_ESTIMATORS = {
    "pairwise_precision": pairwise_precision_estimator,
    "pairwise_recall": pairwise_recall_estimator,
    "pairwise_f": pairwise_f_estimator,
    "cluster_precision": cluster_precision_estimator,
    "cluster_recall": cluster_recall_estimator,
    "cluster_f": cluster_f_estimator,
    "b_cubed_precision": b_cubed_precision_estimator,
    "b_cubed_recall": b_cubed_recall_estimator,
}
def estimates_table(predictions, samples_weights, estimators=DEFAULT_ESTIMATORS):
    """
    Create table of estimates applied to all combinations of predictions and (sample, weights) pairs.

    Args:
        predictions (Dict): Dictionary of membership vectors.
        samples_weights (Dict): Dictionary of dictionaries of the form {"sample": sample, "weights": weights}, where `sample` is the sample membership vector and `weights` is the pandas Series of sampling weights. See estimators definitions for more information.
        estimators (Dict): Dictionary of estimator functions. Each estimator is expected to return a pair (estimate, std).

    Returns:
        DataFrame: Pandas DataFrame with columns "prediction", "sample_weights", "estimator", "value", and "std", where value and std are the point estimate and standard deviation estimate for the estimator applied to the given prediction, sample and sampling weights.

    Examples:
        >>> import pandas as pd
        >>> from er_evaluation.estimators import *
        >>> predictions = {"prediction_1": pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])}
        >>> samples_weights = {"sample_1": {"sample": pd.Series(index=[1,2,3,4,5,8], data=["c1", "c1", "c1", "c2", "c2", "c4"]), "weights": pd.Series(1, index=["c1", "c2", "c4"])}}
        >>> estimators = {"precision": pairwise_precision_estimator, "recall": pairwise_recall_estimator}
        >>> estimates_table(predictions, samples_weights, estimators)  # doctest: +NORMALIZE_WHITESPACE
             prediction sample_weights  estimator     value       std
        0  prediction_1       sample_1  precision  0.388889  0.254588
        1  prediction_1       sample_1     recall  0.296875  0.108253
    """
    # One row per (prediction key, sample/weights key, estimator key) combination.
    grid = expand_grid(
        prediction=predictions,
        sample_weights=samples_weights,
        estimator=estimators,
    )

    def _evaluate(row):
        # Resolve the keys stored in the row back to the actual objects.
        estimator = estimators[row["estimator"]]
        sample_and_weights = samples_weights[row["sample_weights"]]
        return estimator(
            predictions[row["prediction"]],
            sample_and_weights["sample"],
            sample_and_weights["weights"],
        )

    # Each estimator returns an (estimate, std) pair; split it into two columns.
    pairs = grid.apply(_evaluate, axis=1).tolist()
    grid[["value", "std"]] = pd.DataFrame(pairs, index=grid.index)

    return grid