# Source code for er_evaluation.datasets.rldata

from er_evaluation.utils import load_module_tsv

# Dotted path of the package that holds the bundled raw data files.
DATA_MODULE = "er_evaluation.datasets.raw_data"
# Sub-package with the RLdata TSV files; passed to ``load_module_tsv`` below.
RLDATA_MODULE = f"{DATA_MODULE}.rldata"


def _make_rldisambiguations(rldata):
    """
    Build the four toy predicted disambiguations for an RLdata dataframe.

    Each disambiguation is a series of cluster labels obtained by joining a few columns with single spaces, so that
    two records fall in the same cluster exactly when those columns match. They exist to showcase features of the
    ER-Evaluation package and are not meant to be accurate.

    The returned dictionary contains, in this order:

    * **name**: ``fname_c1`` and ``lname_c1``.
    * **name_by**: ``fname_c1``, ``lname_c1``, and ``by`` (birth year).
    * **name_bm**: ``fname_c1``, ``lname_c1``, and ``bm`` (birth month).
    * **name_bd**: ``fname_c1``, ``lname_c1``, and ``bd`` (birth day).

    Args:
        rldata (DataFrame): RLdata500 or RLdata10000 dataframe (see :meth:`er_evaluation.datasets.load_rldata500`).
            The columns used here must support ``+`` concatenation with strings.

    Returns:
        dict: Dictionary mapping each disambiguation name to its series of labels.
    """
    # Every label starts with "<first name> <last name>".
    full_name = rldata["fname_c1"] + " " + rldata["lname_c1"]

    predictions = {"name": full_name}
    # The remaining variants append exactly one birth-date component each.
    for key, column in (("name_by", "by"), ("name_bm", "bm"), ("name_bd", "bd")):
        predictions[key] = full_name + " " + rldata[column]

    return predictions


def load_rldata500():
    """
    Load the RLdata500 dataset.

    Dataset with 500 rows, including 50 noisy duplicate records, from the `RecordLinkage <https://cran.r-project.org/web/packages/RecordLinkage/RecordLinkage.pdf>`_ R package.

    Unique identifiers for each row can be obtained from :meth:`er_evaluation.datasets.load_rldata500_disambiguations`.

    Columns are:

    * **fname_c1**: First name, first component.
    * **fname_c2**: First name, second component.
    * **lname_c1**: Last name, first component.
    * **lname_c2**: Last name, second component.
    * **by**: Year of birth.
    * **bm**: Month of birth.
    * **bd**: Day of birth.

    Returns:
        DataFrame: RLdata500 dataset.
    """
    # dtype=str is requested so columns load as strings; the toy disambiguations
    # in this module build their labels by string-concatenating these columns.
    return load_module_tsv(RLDATA_MODULE, "RLdata500.tsv", dtype=str)


def load_rldata500_disambiguations():
    """
    Load reference and predicted disambiguations for the RLdata500 dataset.

    The reference disambiguation is the series of true unique identifiers for RLdata500.

    Predicted disambiguations are a set of four toy disambiguations meant to showcase and test features of this package. The four predicted disambiguations are:

    * **name**: Disambiguation based on exact matching first name and last name.
    * **name_by**: Disambiguation based on exact matching first name, last name, and birth year.
    * **name_bm**: Disambiguation based on exact matching first name, last name, and birth month.
    * **name_bd**: Disambiguation based on exact matching first name, last name, and birth day.

    These are returned in a dictionary with the above named elements.

    Returns:
        tuple: tuple of the form ``(predictions, reference)``, where ``reference`` is the ground truth disambiguation and ``predictions`` is a dictionary with four toy disambiguations.

    Examples:

        Load ground truth and the set of four toy predictions:

        >>> predictions, reference = load_rldata500_disambiguations()

        Compute pairwise precision for each prediction:

        >>> from er_evaluation.metrics import pairwise_precision
        >>> pairwise_precision(predictions["name"], reference)
        0.4523809523809524

        >>> pairwise_precision(predictions["name_by"], reference)
        1.0

        >>> pairwise_precision(predictions["name_bm"], reference)
        0.7619047619047619

        >>> pairwise_precision(predictions["name_bd"], reference)
        1.0
    """
    # Ground truth: the first column of the identity table, taken as a series.
    reference = load_module_tsv(RLDATA_MODULE, "identity.RLdata500.tsv").iloc[:, 0]

    # Toy predictions are derived from the records themselves.
    predictions = _make_rldisambiguations(load_rldata500())

    return predictions, reference


def load_rldata10000():
    """
    Load the RLdata10000 dataset.

    Dataset with 10000 rows, including 1000 noisy duplicate records, from the `RecordLinkage <https://cran.r-project.org/web/packages/RecordLinkage/RecordLinkage.pdf>`_ R package.

    Unique identifiers for each row can be obtained from :meth:`er_evaluation.datasets.load_rldata10000_disambiguations`.

    Columns are:

    * **fname_c1**: First name, first component.
    * **fname_c2**: First name, second component.
    * **lname_c1**: Last name, first component.
    * **lname_c2**: Last name, second component.
    * **by**: Year of birth.
    * **bm**: Month of birth.
    * **bd**: Day of birth.

    Returns:
        DataFrame: RLdata10000 dataset.
    """
    # dtype=str is requested so columns load as strings; the toy disambiguations
    # in this module build their labels by string-concatenating these columns.
    return load_module_tsv(RLDATA_MODULE, "RLdata10000.tsv", dtype=str)


def load_rldata10000_disambiguations():
    """
    Load reference and predicted disambiguations for the RLdata10000 dataset.

    The reference disambiguation is the series of true unique identifiers for RLdata10000.

    Predicted disambiguations are a set of four toy disambiguations meant to showcase and test features of this package. The four predicted disambiguations are:

    * **name**: Disambiguation based on exact matching first name and last name.
    * **name_by**: Disambiguation based on exact matching first name, last name, and birth year.
    * **name_bm**: Disambiguation based on exact matching first name, last name, and birth month.
    * **name_bd**: Disambiguation based on exact matching first name, last name, and birth day.

    These are returned in a dictionary with the above named elements.

    Returns:
        tuple: tuple of the form ``(predictions, reference)``, where ``reference`` is the ground truth disambiguation and ``predictions`` is a dictionary with four toy disambiguations.

    Examples:

        Load ground truth and the set of four toy predictions:

        >>> predictions, reference = load_rldata10000_disambiguations()

        Compute pairwise precision for each prediction:

        >>> from er_evaluation.metrics import pairwise_precision
        >>> pairwise_precision(predictions["name"], reference)
        0.04653923780125846

        >>> pairwise_precision(predictions["name_by"], reference)
        0.7028571428571428

        >>> pairwise_precision(predictions["name_bm"], reference)
        0.3076086956521739

        >>> pairwise_precision(predictions["name_bd"], reference)
        0.501937984496124
    """
    # Ground truth: the first column of the identity table, taken as a series.
    reference = load_module_tsv(RLDATA_MODULE, "identity.RLdata10000.tsv").iloc[:, 0]

    # Toy predictions are derived from the records themselves.
    predictions = _make_rldisambiguations(load_rldata10000())

    return predictions, reference