# Source code for er_evaluation.utils._utils

import itertools
import logging
from importlib import resources

import numpy as np
import pandas as pd

from er_evaluation.data_structures import MembershipVector


def load_module_parquet(module, filename):
    """
    Read a parquet resource bundled inside a package into a DataFrame.

    The resource is opened in binary mode through ``importlib.resources`` and
    parsed by pandas with the pyarrow engine.

    Args:
        module (string): Dotted path of the package holding the file, such as "er_evaluation.datasets.raw_data.rldata"
        filename (string): Name of the parquet file inside that package.

    Returns:
        pandas DataFrame
    """
    with resources.open_binary(module, filename) as parquet_stream:
        # The whole file is parsed before the stream is closed on exit.
        return pd.read_parquet(parquet_stream, engine="pyarrow")


def load_module_tsv(module, filename, dtype=str):
    """
    Read a tab-separated resource bundled inside a package into a DataFrame.

    The resource is opened in text mode through ``importlib.resources`` and
    parsed by ``pandas.read_csv`` with a tab separator.

    Args:
        module (string): Dotted path of the package holding the file, such as "er_evaluation.datasets.raw_data.rldata"
        filename (string): Name of the tsv file inside that package.
        dtype: Data type passed to ``pandas.read_csv``. Defaults to str, so values are read as strings.

    Returns:
        pandas DataFrame
    """
    with resources.open_text(module, filename) as tsv_stream:
        # The whole file is parsed before the stream is closed on exit.
        return pd.read_csv(tsv_stream, sep="\t", dtype=dtype)


def sample_clusters(membership, weights="uniform", sample_prop=0.2, size=None, replace=True, random_state=1):
    """
    Sample clusters from a membership vector.

    Args:
        membership (Series): Membership vector.
        weights (str, optional): Probability weights to use. Should be one of "uniform", "cluster_size", or a pandas Series indexed by cluster identifiers and with values corresponding to probability weights. Defaults to "uniform".
        sample_prop (float, optional): Proportion of clusters to sample. Only used when ``size`` is None. Defaults to 0.2.
        size (int, optional): Number of draws to make. When given, it takes precedence over ``sample_prop``. Defaults to None.
        replace (bool, optional): Whether or not to sample with replacement. Defaults to True.
        random_state (int, optional): Random seed. Defaults to 1.

    Returns:
        Series: Membership vector with elements corresponding to sampled clusters.

    Raises:
        ValueError: If ``weights`` is a string other than "uniform" or "cluster_size", or is neither a string nor a pandas Series.

    Note:
        Sampling uses a private random generator seeded with ``random_state``;
        NumPy's global random state is left untouched.

    Examples:

        Load a toy dataset:

        >>> from er_evaluation.datasets import load_rldata10000_disambiguations
        >>> predictions, reference = load_rldata10000_disambiguations()

        Sample a set of ground truth clusters uniformly at random:

        >>> sample = sample_clusters(reference, weights="uniform", sample_prop=0.2)

        Compute pairwise_precision on the sample:

        >>> from er_evaluation.metrics import pairwise_precision
        >>> pairwise_precision(predictions['name_by'], sample)
        0.96

        Compare to the true precision on the full data:

        >>> pairwise_precision(predictions['name_by'], reference)
        0.7028571428571428

        The metric computed on a sample is over-optimistic (0.96 versus true precision of 0.7). Instead, use an estimator to accurately estimate pairwise precision from a sample, which returns a point estimate and its standard deviation estimate:

        >>> from er_evaluation.estimators import pairwise_precision_estimator
        >>> pairwise_precision_estimator(predictions['name_by'], sample, weights="uniform")
        (0.7633453805063894, 0.04223296142335369)
    """
    membership = MembershipVector(membership)

    # A local legacy generator is seeded exactly like ``np.random.seed`` would
    # seed the global one, so draws are unchanged, but the caller's global
    # random state is no longer overwritten as a side effect.
    rng = np.random.RandomState(random_state)

    if size is not None:
        sample_size = size
    else:
        sample_size = int(sample_prop * membership.nunique())

    if isinstance(weights, pd.Series):
        # Normalize the user-provided weights so that they sum to one.
        selected_clusters = rng.choice(
            weights.index,
            size=sample_size,
            replace=replace,
            p=weights.values / np.sum(weights.values),
        )
    elif isinstance(weights, str):
        if weights == "uniform":
            selected_clusters = rng.choice(membership.unique(), size=sample_size, replace=replace)
        elif weights == "cluster_size":
            # Drawing elements uniformly selects each cluster with probability
            # proportional to its number of elements.
            selected_clusters = rng.choice(
                membership.values,
                size=sample_size,
                replace=replace,
            )
        else:
            raise ValueError(
                f"Invalid weights argument. Valid strings are 'uniform' or 'cluster_size', instead got {weights}"
            )
    else:
        raise ValueError(
            f"Invalid weights argument. Should be a string or a pandas Series, instead got type {type(weights)}."
        )

    # Keep every element whose cluster was drawn at least once.
    return membership[membership.isin(selected_clusters)]


def relevant_prediction_subset(prediction, sample):
    """
    Return predicted clusters which intersect sampled clusters.

    An element of ``prediction`` is kept when its predicted cluster value also
    appears on at least one element whose index label is present in the index
    of ``sample``.

    Args:
        prediction (Series): Predicted membership vector.
        sample (Series): Membership vector of the sampled clusters.

    Returns:
        Series: Subset of ``prediction`` restricted to the retained clusters. A
        warning is logged when this subset is empty.
    """
    prediction = MembershipVector(prediction)
    sample = MembershipVector(sample)

    # Elements of the prediction that belong to the sample.
    in_sample = prediction.index.isin(sample.index)
    # Predicted cluster identifiers carried by those elements.
    touched_clusters = prediction[in_sample].values

    subset = prediction[prediction.isin(touched_clusters)]
    if not len(subset):
        logging.warning("Relevant prediction subset is empty: predicted clusters do not overlap sample clusters.")

    return subset


def expand_grid(**kwargs):
    """
    Build a DataFrame holding the Cartesian product of the given iterables.

    Args:
        kwargs: Named iterables to combine. Each argument name becomes a column name; iterating a dictionary yields its keys.

    Returns:
        DataFrame: One column per argument and one row per combination of argument values, with the last argument varying fastest.

    Examples:
        >>> expand_grid(col1=[1,2], col2=["a", "b"])
           col1 col2
        0     1    a
        1     1    b
        2     2    a
        3     2    b

        >>> expand_grid(col1={1:"something", 2:"something"}, col2=["a", "b"])
           col1 col2
        0     1    a
        1     1    b
        2     2    a
        3     2    b
    """
    column_names = kwargs.keys()
    # Lazy iterator over every combination, in keyword-argument order.
    combinations = itertools.product(*kwargs.values())
    return pd.DataFrame.from_records(combinations, columns=column_names)