import numpy as np
import pandas as pd
from scipy.special import comb
from er_evaluation.data_structures import MembershipVector
def summary_statistics(membership, names=None):
    r"""
    Compute the canonical set of summary statistics for a clustering.

    The following statistics are always reported:

    * Number of clusters
    * Average cluster size
    * Matching rate
    * Hill numbers of order 0, 1, and 2

    When a name is supplied for each cluster element, two more are added:

    * Homonymy rate (proportion of clusters where at least one name is shared with another cluster)
    * Name variation rate (proportion of clusters with name variation within them)

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Names associated with each cluster element. Defaults to None.

    Returns:
        dict: Dictionary mapping each statistic's name to its value.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> summary_statistics(membership)
        {'number_of_clusters': 4, 'average_cluster_size': 2.0, 'matching_rate': 0.875, 'H0': 3, 'H1': 2.82842712474619, 'H2': 2.6666666666666665}
    """
    membership = MembershipVector(membership)

    # Insertion order matters: it is the key order of the returned dict.
    statistics = {}
    statistics["number_of_clusters"] = number_of_clusters(membership)
    statistics["average_cluster_size"] = average_cluster_size(membership)
    statistics["matching_rate"] = matching_rate(membership)
    statistics["H0"] = cluster_hill_number(membership, alpha=0)
    statistics["H1"] = cluster_hill_number(membership, alpha=1)
    statistics["H2"] = cluster_hill_number(membership, alpha=2)

    # Name-based statistics are only meaningful when names were provided.
    if names is None:
        return statistics

    statistics["homonymy_rate"] = homonymy_rate(membership, names)
    statistics["name_variation_rate"] = name_variation_rate(membership, names)
    return statistics
def number_of_clusters(membership):
    r"""
    Count the clusters of a given clustering.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        int: Number of distinct cluster identifiers. NA identifiers are not counted.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> number_of_clusters(membership)
        4
    """
    clustering = MembershipVector(membership)
    distinct_cluster_count = clustering.nunique()
    return distinct_cluster_count
def number_of_links(membership):
    r"""
    Count the pairwise links implied by a given clustering.

    Each cluster of size ``n`` contributes ``n choose 2`` links.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        float: Number of pairs of elements belonging to the same cluster. Clusters identified by NA values are excluded.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> number_of_links(membership)
        5.0
    """
    clustering = MembershipVector(membership)
    sizes = cluster_sizes(clustering)
    # Number of unordered pairs inside each cluster.
    links_per_cluster = comb(sizes, 2)
    return np.sum(links_per_cluster)
def matching_rate(membership):
    r"""
    Compute the **matching rate** for a given clustering.

    Matching rate:
        This is the proportion of elements belonging to clusters of size at least 2.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        float: Proportion of (non-NA) elements that belong to a cluster of size at least 2. Elements with an NA cluster identifier are excluded from both the numerator and the denominator.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> matching_rate(membership)
        0.875
    """
    membership = MembershipVector(membership)
    sizes = cluster_sizes(membership)
    # Elements of non-singleton clusters, over all elements with a non-NA cluster.
    matched_elements = sizes[sizes > 1].sum()
    return matched_elements / membership.count()
def average_cluster_size(membership):
    """
    Compute the mean size of the clusters of a clustering.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        float: Average cluster size.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> average_cluster_size(membership)
        2.0
    """
    clustering = MembershipVector(membership)
    sizes = cluster_sizes(clustering)
    return sizes.mean()
def cluster_sizes(membership):
    r"""
    Compute the number of elements in each cluster.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        Series: Series indexed by cluster identifier whose values are the cluster sizes. NA cluster identifiers are excluded.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_sizes(membership)
        1    2
        2    2
        3    1
        4    3
        dtype: int64
    """
    clustering = MembershipVector(membership)
    # Group the elements by their own cluster identifier and count each group.
    elements_by_cluster = clustering.groupby(clustering)
    return elements_by_cluster.count()
def cluster_sizes_distribution(membership):
    r"""
    Compute the distribution of cluster sizes.

    Args:
        membership (Series): Membership vector representation of a clustering.

    Returns:
        Series: Pandas Series indexed by distinct cluster sizes whose values are the number of clusters of that size.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_sizes_distribution(membership)
        1    1
        2    2
        3    1
        dtype: int64
    """
    clustering = MembershipVector(membership)
    sizes = cluster_sizes(clustering)
    # Group the sizes by their own value to count how many clusters share each size.
    clusters_by_size = sizes.groupby(sizes)
    return clusters_by_size.count()
def cluster_hill_number(membership, alpha=1):
    r"""
    Compute Hill number of a given order.

    Hill numbers:
        The Hill number of order :math:`\alpha \geq 0` of a given probability distribution :math:`p_i`, :math:`i =0,1,2, \dots`, is defined as

        .. math::

            H_\alpha = \left(\sum_{i} p_i^{\alpha} \right)^{1/(1-\alpha)}

        and extended by continuity at :math:`\alpha = 1` (exponential of the Shannon entropy) and :math:`\alpha = \infty` (inverse of the largest probability). Here, we let :math:`p_i` be the proportion of clusters of size :math:`i`.

    Args:
        membership (Series): Membership vector representation of a clustering.
        alpha (float, optional): Order of the Hill Number; ``np.inf`` is accepted. Defaults to 1.

    Returns:
        float: Hill number of order `alpha` for the given clustering. For ``alpha=0`` this is the integer count of distinct cluster sizes.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> cluster_hill_number(membership, alpha=0)
        3
        >>> cluster_hill_number(membership, alpha=1)
        2.82842712474619
        >>> cluster_hill_number(membership, alpha=np.inf)
        2.0
    """
    membership = MembershipVector(membership)
    cs_dist = cluster_sizes_distribution(membership)
    probs = cs_dist / np.sum(cs_dist)
    # Drop zero-probability entries so that log(probs) below stays finite.
    probs = probs[probs > 0]

    if alpha == 0:
        return len(probs)
    if alpha == 1:
        # Limit of the general formula as alpha -> 1.
        return np.exp(-np.sum(probs * np.log(probs)))
    # `np.inf` rather than the `np.Inf` alias, which was removed in NumPy 2.0.
    if alpha == np.inf:
        # Limit of the general formula as alpha -> infinity.
        return 1 / np.max(probs)
    return np.sum(probs**alpha) ** (1 / (1 - alpha))
def homonymy_rate(membership, names):
    r"""
    Compute the homonymy rate of a clustering given the name of each element.

    Homonymy rate:
        The homonymy rate is the proportion of clusters that share a name with another cluster.

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Series indexed by cluster elements and with values corresponding to the associated name. The index of `membership` must be included in the index of `names`.

    Returns:
        float: homonymy rate

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> names = pd.Series(index=[1,2,3,4,5,6,7,8], data=["n1", "n2", "n3", "n4", "n3", "n1", "n2", "n8"])
        >>> homonymy_rate(membership, names)
        0.5
    """
    membership = MembershipVector(membership)
    assert isinstance(names, pd.Series)
    assert all(membership.index.isin(names.index))

    # One row per element present in both inputs: its cluster and its name.
    records = pd.concat(
        {"membership": membership, "name": names},
        axis=1,
        join="inner",
        copy=False,
    )

    # How many times each name occurs overall.
    name_column = records["name"]
    overall_counts = name_column.groupby(name_column).count().reset_index(name="total_count")

    # How many times each name occurs within each cluster.
    per_cluster_counts = records.groupby(["name", "membership"]).size().reset_index(name="cluster_count")

    merged = per_cluster_counts.merge(
        overall_counts,
        on="name",
        copy=False,
        validate="m:1",
    )

    # A positive difference means the name also occurs outside of this cluster.
    merged["diff"] = merged["total_count"] - merged["cluster_count"]
    largest_diff_per_cluster = merged.groupby("membership").agg({"diff": "max"})
    shares_a_name = largest_diff_per_cluster > 0
    return shares_a_name.mean().values[0]
def name_variation_rate(membership, names):
    r"""
    Compute the name variation rate of a clustering given the name of each element.

    Name variation rate:
        The name variation rate is the proportion of clusters with name variation within.

    Args:
        membership (Series): Membership vector representation of a clustering.
        names (Series): Series indexed by cluster elements and with values corresponding to the associated name. The index of `membership` must be included in the index of `names`.

    Returns:
        float: Name variation rate.

    Examples:
        >>> membership = pd.Series(index=[1,2,3,4,5,6,7,8], data=[1,1,2,3,2,4,4,4])
        >>> names = pd.Series(index=[1,2,3,4,5,6,7,8], data=["n1", "n2", "n3", "n4", "n3", "n1", "n2", "n8"])
        >>> name_variation_rate(membership, names)
        0.5
    """
    membership = MembershipVector(membership)
    assert isinstance(names, pd.Series)
    assert all(membership.index.isin(names.index))

    # One row per element present in both inputs: its cluster and its name.
    records = pd.concat(
        {"membership": membership, "name": names},
        axis=1,
        join="inner",
        copy=False,
    )

    # A cluster exhibits name variation when it holds more than one distinct name.
    distinct_names_per_cluster = records.groupby("membership").nunique()
    has_variation = distinct_names_per_cluster > 1
    return has_variation.mean().values[0]