Corrected Cramer's V results in division by zero when n = r

import numpy as np
import scipy.stats as ss


def cramers_corrected_stat(confusion_matrix):
    """Calculate Cramer's V statistic for categorical-categorical association.

    Uses the bias correction from Bergsma,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Args:
        confusion_matrix: 2-D contingency table of counts (DataFrame,
            ndarray, or anything else ``numpy.asarray`` accepts).

    Returns:
        The corrected Cramer's V as a NumPy float.

    Note:
        The denominator ``min(kcorr - 1, rcorr - 1)`` is zero when the total
        count n equals the number of rows r or the number of columns k (or
        when the table has a single row or column). This function does not
        guard against that, so such tables divide by zero and produce
        nan/inf instead of a usable statistic.
    """
    table = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(table)[0]
    # n must be the grand total of all cells (a scalar); summing a DataFrame
    # once only gives per-column totals.
    n = table.sum()
    phi2 = chi2 / n
    r, k = table.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

import pandas as pd

# Minimal data set that triggers the problem: every name is unique, so the
# number of rows of the contingency table equals the number of samples.
data = [
    {'name': 'Alice', 'occupation': 'therapist', 'favorite_color': 'red'},
    {'name': 'Bob', 'occupation': 'fisherman', 'favorite_color': 'blue'},
    {'name': 'Carol', 'occupation': 'scientist', 'favorite_color': 'orange'},
    {'name': 'Doug', 'occupation': 'scientist', 'favorite_color': 'red'},
]
df = pd.DataFrame(data)
confusion_matrix = pd.crosstab(df['name'], df['occupation'])
# n = 4 (number of samples), r = 4 (number of unique names), k = 3 (number of unique occupations)
print(cramers_corrected_stat(confusion_matrix))

You can handle the division by zero when n = r by guarding the denominator: if it is not positive, return a fixed value instead of dividing. I modified your function this way:

Your original function:

def cramers_corrected_stat(confusion_matrix):
    """Calculate Cramer's V statistic for categorical-categorical association.

    Uses the bias correction from Bergsma,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Args:
        confusion_matrix: 2-D contingency table of counts (DataFrame,
            ndarray, or anything else ``numpy.asarray`` accepts).

    Returns:
        The corrected Cramer's V as a NumPy float.

    Note:
        The denominator ``min(kcorr - 1, rcorr - 1)`` is zero when the total
        count n equals the number of rows r or the number of columns k (or
        when the table has a single row or column). This version does not
        guard against that, so such tables divide by zero and produce
        nan/inf instead of a usable statistic.
    """
    table = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(table)[0]
    # n must be the grand total of all cells (a scalar); summing a DataFrame
    # once only gives per-column totals.
    n = table.sum()
    phi2 = chi2 / n
    r, k = table.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

becomes

def cramers_corrected_stat(confusion_matrix):
    """Return the bias-corrected Cramers V for a two-way contingency table.

    Applies the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    The table must support ``.sum().sum()`` and ``.shape`` (e.g. the result
    of ``pandas.crosstab``). If the corrected smaller dimension leaves a
    non-positive denominator, 0 is returned instead of dividing.
    """
    chi_squared = ss.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum().sum()
    n_rows, n_cols = confusion_matrix.shape
    total_minus_one = total - 1

    phi_squared = chi_squared / total
    bias = ((n_cols - 1) * (n_rows - 1)) / total_minus_one
    phi_squared_corrected = max(0, phi_squared - bias)

    rows_corrected = n_rows - ((n_rows - 1) ** 2) / total_minus_one
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / total_minus_one

    # Guard clause: a zero or negative denominator means the statistic is
    # undefined for this table.
    scale = min((cols_corrected - 1), (rows_corrected - 1))
    if scale <= 0:
        return 0
    return np.sqrt(phi_squared_corrected / scale)

With your sample data (n = 4, r = 4, k = 3):

# One record per person; every name is unique, so r equals n.
data = [
    dict(zip(('name', 'occupation', 'favorite_color'), row))
    for row in (
        ('Alice', 'therapist', 'red'),
        ('Bob', 'fisherman', 'blue'),
        ('Carol', 'scientist', 'orange'),
        ('Doug', 'scientist', 'red'),
    )
]

df = pd.DataFrame(data)

# Cross-tabulate names (rows) against occupations (columns).
confusion_matrix = pd.crosstab(index=df['name'], columns=df['occupation'])
result = cramers_corrected_stat(confusion_matrix)
print(f"Cramer's V Result: {result}")

you'd get

Cramer's V Result: 0

To also handle the corner case where n = k = r, I updated the function as follows:

import numpy as np
import pandas as pd
import scipy.stats as ss

def cramers_corrected_stat(confusion_matrix):
    """Calculate Cramer's V statistic for categorical-categorical association.

    Uses the bias correction from Bergsma,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Args:
        confusion_matrix: 2-D contingency table of counts. Anything
            ``numpy.asarray`` accepts works (DataFrame, ndarray, nested
            lists).

    Returns:
        float: the corrected Cramer's V. Degenerate tables, for which the
        corrected formula would divide by zero, get fixed values:

        * 0.0 when both corrected dimensions collapse (e.g. n = k = r) or
          when the table holds at most one observation;
        * 1.0 when only one corrected dimension collapses (e.g. n = r).

    Any exception raised by ``scipy.stats.chi2_contingency`` for an invalid
    table propagates unchanged.
    """
    table = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(table)[0]
    n = table.sum()
    r, k = table.shape

    # With a single observation (n - 1) == 0 and every correction term is
    # 0/0; treat it like the other fully collapsed tables.
    if n <= 1:
        return 0.0

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    if rcorr <= 1 and kcorr <= 1:
        return 0.0

    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        return 1.0
    # Always hand back a plain float so every branch has the same type.
    return float(np.sqrt(phi2corr / denominator))

# Sample data: four people with unique names, so the contingency table
# built below has as many rows as there are samples.
data = [
    dict(zip(('name', 'occupation', 'favorite_color'), row))
    for row in (
        ('Alice', 'therapist', 'red'),
        ('Bob', 'fisherman', 'blue'),
        ('Carol', 'scientist', 'orange'),
        ('Doug', 'scientist', 'red'),
    )
]

df = pd.DataFrame(data)

# Cross-tabulate names (rows) against occupations (columns).
confusion_matrix = pd.crosstab(index=df['name'], columns=df['occupation'])
result = cramers_corrected_stat(confusion_matrix)
print(f"Cramer's V Result: {result}")

Recommended topics

Hot tags