import itertools
import logging
from importlib import resources
import numpy as np
import pandas as pd
from er_evaluation.data_structures import MembershipVector
def load_module_parquet(module, filename):
    """
    Read a parquet resource bundled inside a package into a DataFrame.

    The resource is opened in binary mode through ``importlib.resources`` and
    parsed with the pyarrow engine.

    Args:
        module (string): Dotted path of the package holding the file, such as "er_evaluation.datasets.raw_data.rldata"
        filename (string): Name of the parquet file inside that package.

    Returns:
        pandas DataFrame
    """
    with resources.open_binary(module, filename) as parquet_stream:
        # Parse while the resource handle is still open.
        return pd.read_parquet(parquet_stream, engine="pyarrow")
def load_module_tsv(module, filename, dtype=str):
    """
    Read a tab-separated resource bundled inside a package into a DataFrame.

    The resource is opened in text mode through ``importlib.resources`` and
    parsed with ``pandas.read_csv`` using a tab separator.

    Args:
        module (string): Dotted path of the package holding the file, such as "er_evaluation.datasets.raw_data.rldata"
        filename (string): Name of the tsv file inside that package.
        dtype: Data type forwarded to ``pandas.read_csv``. Defaults to str.

    Returns:
        pandas DataFrame
    """
    with resources.open_text(module, filename) as tsv_stream:
        # Parse while the resource handle is still open.
        return pd.read_csv(tsv_stream, sep="\t", dtype=dtype)
def sample_clusters(membership, weights="uniform", sample_prop=0.2, size=None, replace=True, random_state=1):
    """
    Sample clusters from a membership vector.

    Args:
        membership (Series): Membership vector.
        weights (str, optional): Probability weights to use. Should be one of "uniform", "cluster_size", or a pandas Series indexed by cluster identifiers and with values corresponding to probability weights. Defaults to "uniform".
        sample_prop (float, optional): Proportion of clusters to sample. Only used when ``size`` is None. Defaults to 0.2.
        size (int, optional): Number of draws to make. When given, it takes precedence over ``sample_prop``. Defaults to None.
        replace (bool, optional): Whether or not to sample with replacement. Defaults to True.
        random_state (int, optional): Random seed. Defaults to 1.

    Returns:
        Series: Membership vector with elements corresponding to sampled clusters.

    Raises:
        ValueError: If ``weights`` is a string other than "uniform" or "cluster_size", or is neither a string nor a pandas Series.

    Examples:
        Load a toy dataset:

        >>> from er_evaluation.datasets import load_rldata10000_disambiguations
        >>> predictions, reference = load_rldata10000_disambiguations()

        Sample a set of ground truth clusters uniformly at random:

        >>> sample = sample_clusters(reference, weights="uniform", sample_prop=0.2)

        Compute pairwise_precision on the sample:

        >>> from er_evaluation.metrics import pairwise_precision
        >>> pairwise_precision(predictions['name_by'], sample)
        0.96

        Compare to the true precision on the full data:

        >>> pairwise_precision(predictions['name_by'], reference)
        0.7028571428571428

        The metric computed on a sample is over-optimistic (0.96 versus true precision of 0.7). Instead, use an estimator to accurately estimate pairwise precision from a sample, which returns a point estimate and its standard deviation estimate:

        >>> from er_evaluation.estimators import pairwise_precision_estimator
        >>> pairwise_precision_estimator(predictions['name_by'], sample, weights="uniform")
        (0.7633453805063894, 0.04223296142335369)
    """
    membership = MembershipVector(membership)

    # Use a private legacy generator instead of np.random.seed(): it yields the
    # same draws for a given seed but does not reseed NumPy's global RNG, so
    # calling this function no longer disturbs unrelated random code.
    rng = np.random.RandomState(random_state)

    if size is not None:
        sample_size = size
    else:
        sample_size = int(sample_prop * membership.nunique())

    if isinstance(weights, pd.Series):
        # Normalize the user-provided weights into a probability vector.
        selected_clusters = rng.choice(
            weights.index,
            size=sample_size,
            replace=replace,
            p=weights.values / np.sum(weights.values),
        )
    elif isinstance(weights, str):
        if weights == "uniform":
            selected_clusters = rng.choice(membership.unique(), size=sample_size, replace=replace)
        elif weights == "cluster_size":
            # Drawing from the per-element cluster labels picks each cluster
            # with probability proportional to its size.
            # NOTE: draws are over elements, so even with replace=False the
            # same cluster label can be drawn more than once.
            selected_clusters = rng.choice(
                membership.values,
                size=sample_size,
                replace=replace,
            )
        else:
            raise ValueError(
                f"Invalid weights argument. Valid strings are 'uniform' or 'cluster_size', instead got {weights}"
            )
    else:
        raise ValueError(
            f"Invalid weights argument. Should be a string or a pandas Series, instead got type {type(weights)}."
        )

    # Keep every element whose cluster label was drawn at least once.
    return membership[membership.isin(selected_clusters)]
def relevant_prediction_subset(prediction, sample):
    """
    Return predicted clusters which intersect sampled clusters.

    Args:
        prediction: Predicted membership vector.
        sample: Membership vector of the sampled clusters.

    Returns:
        The entries of ``prediction`` whose predicted cluster label is shared
        with at least one element that also appears in the index of ``sample``.
        A warning is logged when this subset is empty.
    """
    prediction = MembershipVector(prediction)
    sample = MembershipVector(sample)

    # Elements of the prediction that also belong to the sample.
    in_sample = prediction.index.isin(sample.index)
    # Predicted cluster labels carried by those shared elements.
    touched_labels = prediction[in_sample].values
    # Every predicted element belonging to one of the touched clusters.
    subset = prediction[prediction.isin(touched_labels)]

    if len(subset) == 0:
        logging.warning("Relevant prediction subset is empty: predicted clusters do not overlap sample clusters.")

    return subset
def expand_grid(**kwargs):
    """
    Build a DataFrame holding the Cartesian product of the given iterables.

    Args:
        kwargs: Named iterables to combine. Each keyword becomes a column name; iterating a dict value yields its keys.

    Returns:
        DataFrame: One column per keyword argument and one row per combination of values, with the last keyword varying fastest.

    Examples:
        >>> expand_grid(col1=[1,2], col2=["a", "b"])
           col1 col2
        0     1    a
        1     1    b
        2     2    a
        3     2    b

        >>> expand_grid(col1={1:"something", 2:"something"}, col2=["a", "b"])
           col1 col2
        0     1    a
        1     1    b
        2     2    a
        3     2    b
    """
    column_names = kwargs.keys()
    # Lazy iterator over every combination, in keyword order.
    combinations = itertools.product(*kwargs.values())
    return pd.DataFrame.from_records(combinations, columns=column_names)