# Source code for er_evaluation.datasets.rldata

from er_evaluation.utils import load_module_tsv

# Dotted path of the package used as the root for bundled data modules.
DATA_MODULE = "er_evaluation.datasets.raw_data"
# Module path handed to ``load_module_tsv`` when reading the RLdata TSV files.
RLDATA_MODULE = f"{DATA_MODULE}.rldata"


def _make_rldisambiguations(rldata):
    """
    Build the four toy predicted disambiguations for an RLdata dataframe.

    Each disambiguation is a series of string labels; records that share a label are
    considered matched. They exist to showcase features of the ER-Evaluation package
    and are not meant to be accurate.

    The four disambiguations are:

    * **name**: Exact match on first name and last name.
    * **name_by**: Exact match on first name, last name, and birth year.
    * **name_bm**: Exact match on first name, last name, and birth month.
    * **name_bd**: Exact match on first name, last name, and birth day.

    Args:
        rldata (DataFrame): RLdata500 or RLdata10000 dataframe (see :meth:`er_evaluation.datasets.load_rldata500`).
            The columns ``fname_c1``, ``lname_c1``, ``by``, ``bm`` and ``bd`` are read and joined
            with the ``+`` operator, so they are expected to hold strings.

    Returns:
        dict: Dictionary with keys ``name``, ``name_by``, ``name_bm`` and ``name_bd``, in that order.
    """
    # Base label shared by every disambiguation: "<first name> <last name>".
    full_name = rldata["fname_c1"] + " " + rldata["lname_c1"]

    predictions = {"name": full_name}
    # Each refinement appends one birth-date component to the base label.
    for key, column in (("name_by", "by"), ("name_bm", "bm"), ("name_bd", "bd")):
        predictions[key] = full_name + " " + rldata[column]

    return predictions


def load_rldata500():
    """
    Load the RLdata500 dataset.

    RLdata500 has 500 rows, 50 of which are noisy duplicate records. It comes from the
    `RecordLinkage <https://cran.r-project.org/web/packages/RecordLinkage/RecordLinkage.pdf>`_ R package.

    Unique identifiers for each row can be obtained from
    :meth:`er_evaluation.datasets.load_rldata500_disambiguations`.

    Columns are:

    * **fname_c1**: First name, first component.
    * **fname_c2**: First name, second component.
    * **lname_c1**: Last name, first component.
    * **lname_c2**: Last name, second component.
    * **by**: Year of birth.
    * **bm**: Month of birth.
    * **bd**: Day of birth.

    Returns:
        DataFrame: RLdata500 dataset.
    """
    # dtype=str is forwarded to the loader (presumably so every column is read as text).
    return load_module_tsv(RLDATA_MODULE, "RLdata500.tsv", dtype=str)
def load_rldata500_disambiguations():
    """
    Load the reference and predicted disambiguations for the RLdata500 dataset.

    The reference disambiguation is the series of true unique identifiers for RLdata500.
    The predicted disambiguations are four toy disambiguations meant to showcase and test
    features of this package:

    * **name**: Disambiguation based on exact matching first name and last name.
    * **name_by**: Disambiguation based on exact matching first name, last name, and birth year.
    * **name_bm**: Disambiguation based on exact matching first name, last name, and birth month.
    * **name_bd**: Disambiguation based on exact matching first name, last name, and birth day.

    These are returned in a dictionary with the above named elements.

    Returns:
        tuple: tuple of the form ``(predictions, reference)``, where ``reference`` is the ground truth
        disambiguation and ``predictions`` is a dictionary with four toy disambiguations.

    Examples:
        Load ground truth and the set of four toy predictions:

        >>> predictions, reference = load_rldata500_disambiguations()

        Compute pairwise precision for each prediction:

        >>> from er_evaluation.metrics import pairwise_precision
        >>> pairwise_precision(predictions["name"], reference)
        0.4523809523809524
        >>> pairwise_precision(predictions["name_by"], reference)
        1.0
        >>> pairwise_precision(predictions["name_bm"], reference)
        0.7619047619047619
        >>> pairwise_precision(predictions["name_bd"], reference)
        1.0
    """
    records = load_rldata500()

    # The reference labels are taken from the first column of the identity file.
    identity_table = load_module_tsv(RLDATA_MODULE, "identity.RLdata500.tsv")
    reference = identity_table.iloc[:, 0]

    return _make_rldisambiguations(records), reference
def load_rldata10000():
    """
    Load RLdata10000 dataset.

    Dataset with 10000 rows, including 1000 noisy duplicate records, from the
    `RecordLinkage <https://cran.r-project.org/web/packages/RecordLinkage/RecordLinkage.pdf>`_ R package.

    Unique identifiers for each row can be obtained from
    :meth:`er_evaluation.datasets.load_rldata10000_disambiguations`.

    Columns are:

    * **fname_c1**: First name, first component.
    * **fname_c2**: First name, second component.
    * **lname_c1**: Last name, first component.
    * **lname_c2**: Last name, second component.
    * **by**: Year of birth.
    * **bm**: Month of birth.
    * **bd**: Day of birth.

    Returns:
        DataFrame: RLdata10000 dataset.
    """
    # dtype=str is forwarded to the loader (presumably so every column is read as text).
    rldata10000 = load_module_tsv(RLDATA_MODULE, "RLdata10000.tsv", dtype=str)

    return rldata10000
def load_rldata10000_disambiguations():
    """
    Load the reference and predicted disambiguations for the RLdata10000 dataset.

    The reference disambiguation is the series of true unique identifiers for RLdata10000.
    The predicted disambiguations are four toy disambiguations meant to showcase and test
    features of this package:

    * **name**: Disambiguation based on exact matching first name and last name.
    * **name_by**: Disambiguation based on exact matching first name, last name, and birth year.
    * **name_bm**: Disambiguation based on exact matching first name, last name, and birth month.
    * **name_bd**: Disambiguation based on exact matching first name, last name, and birth day.

    These are returned in a dictionary with the above named elements.

    Returns:
        tuple: tuple of the form ``(predictions, reference)``, where ``reference`` is the ground truth
        disambiguation and ``predictions`` is a dictionary with four toy disambiguations.

    Examples:
        Load ground truth and the set of four toy predictions:

        >>> predictions, reference = load_rldata10000_disambiguations()

        Compute pairwise precision for each prediction:

        >>> from er_evaluation.metrics import pairwise_precision
        >>> pairwise_precision(predictions["name"], reference)
        0.04653923780125846
        >>> pairwise_precision(predictions["name_by"], reference)
        0.7028571428571428
        >>> pairwise_precision(predictions["name_bm"], reference)
        0.3076086956521739
        >>> pairwise_precision(predictions["name_bd"], reference)
        0.501937984496124
    """
    records = load_rldata10000()

    # The reference labels are taken from the first column of the identity file.
    identity_table = load_module_tsv(RLDATA_MODULE, "identity.RLdata10000.tsv")
    reference = identity_table.iloc[:, 0]

    return _make_rldisambiguations(records), reference