From 0a0369f5e1fd341497eb25610457d7a9751bde0f Mon Sep 17 00:00:00 2001 From: LSaldyt Date: Fri, 12 Jan 2018 14:19:11 -0700 Subject: [PATCH] Generalizes X^2 test to use G value --- copycat/statistics.py | 115 +++++++++++++++++++----------------------- cross_compare.py | 6 +-- 2 files changed, 53 insertions(+), 68 deletions(-) diff --git a/copycat/statistics.py b/copycat/statistics.py index 5a8c660..8c4ce85 100644 --- a/copycat/statistics.py +++ b/copycat/statistics.py @@ -1,7 +1,11 @@ from collections import defaultdict -from pprint import pprint -# CHI2 values for n degrees freedom -_chiSquared_table = { +from pprint import pprint +from math import log + +# comparison values for n degrees freedom +# These values are useable for both the chi^2 and G tests + +_ptable = { 1:3.841, 2:5.991, 3:7.815, @@ -11,13 +15,36 @@ _chiSquared_table = { 7:14.067, 8:15.507, 9:16.919, - 10:18.307 + 10:18.307, + 11:19.7, + 12:21, + 13:22.4, + 14:23.7, + 15:25, + 16:26.3 } -class ChiSquaredException(Exception): - pass +def g_value(actual, expected): + # G = 2 * sum(Oi * ln(Oi/Ei)) + answerKeys = set(list(actual.keys()) + list(expected.keys())) + degreesFreedom = len(answerKeys) + G = 0 -def chi_squared(actual, expected): + get_count = lambda k, d : d[k]['count'] if k in d else 0 + + for k in answerKeys: + E = get_count(k, expected) + O = get_count(k, actual) + if E == 0: + print(' Warning! Expected 0 counts of {}, but got {}'.format(k, O)) + elif O == 0: + print(' Warning! O = {}'.format(O)) + else: + G += O * log(O/E) + G *= 2 + return degreesFreedom, G + +def chi_value(actual, expected): answerKeys = set(list(actual.keys()) + list(expected.keys())) degreesFreedom = len(answerKeys) chiSquared = 0 @@ -33,67 +60,21 @@ def chi_squared(actual, expected): chiSquared += (O - E) ** 2 / E return degreesFreedom, chiSquared -def chi_squared_diff(actual, expected): - df, chiSquared = chi_squared(actual, expected) - return (chiSquared < _chiSquared_table[df]) +def dist_test(actual, expected, calculation): + df, p = calculation(actual, expected) + if df not in _ptable: + raise Exception('{} degrees of freedom does not have a corresponding chi squared value.' + \ + ' Please look up the value and add it to the table in copycat/statistics.py'.format(df)) + return (p < _ptable[df]) -def chi_squared_test(actual, expected, show=True): - df, chiSquared = chi_squared(actual, expected) - - if chiSquared >= _chiSquared_table[df]: - if show: - print('Significant difference between expected and actual answer distributions: \n' + - 'Chi2 value: {} with {} degrees of freedom'.format(chiSquared, df)) - return False - return True - -def cross_formula_chi_squared(actualDict, expectedDict): - failures = [] - for ka, actual in actualDict.items(): - for ke, expected in expectedDict.items(): - print(' Comparing {} with {}: '.format(ka, ke), end='') - if not chi_squared_test(actual, expected, show=False): - failures.append('{}:{}'.format(ka, ke)) - print(' Failed.') - else: - print(' Succeeded.') - return failures - -def cross_formula_chi_squared_table(actualDict, expectedDict): +def cross_formula_table(actualDict, expectedDict, calculation): data = dict() for ka, actual in actualDict.items(): for ke, expected in expectedDict.items(): - #df, chiSquared = chi_squared(actual, expected) - data[(ka, ke)] = chi_squared_diff(actual, expected) - #data[(ka, ke)] = (df, chiSquared) + data[(ka, ke)] = dist_test(actual, expected, calculation) return data -def cross_chi_squared(problemSets): - failures = defaultdict(list) - for i, (a, problemSetA) in enumerate(problemSets): - for b, problemSetB in problemSets[i + 1:]: - for problemA in problemSetA: - for problemB in problemSetB: - if (problemA.initial == problemB.initial and - problemA.modified == problemB.modified and - problemA.target == problemB.target): - answersA = problemA.distributions - answersB = problemB.distributions - print('-' * 80) - print('\n') - print('{} x {}'.format(a, b)) - problemString = '{} x {} for: {}:{}::{}:_\n'.format(a, - b, - problemA.initial, - problemA.modified, - problemA.target) - failures[problemString].append(cross_formula_chi_squared(answersA, answersB)) - pprint(answersA) - pprint(answersB) - print('\n') - return failures - -def cross_chi_squared_table(problemSets): +def cross_table(problemSets, calculation=g_value): table = defaultdict(dict) for i, (a, problemSetA) in enumerate(problemSets): for b, problemSetB in problemSets[i + 1:]: @@ -104,7 +85,11 @@ def cross_chi_squared_table(problemSets): problemA.target == problemB.target): answersA = problemA.distributions answersB = problemB.distributions - table[(problemA.initial, problemA.modified, problemA.target)][(a, b)] = cross_formula_chi_squared_table(answersA, answersB) + table[(problemA.initial, + problemA.modified, + problemA.target)][(a, b)] = ( + cross_formula_table( + answersA, answersB, calculation)) return table def iso_chi_squared(actualDict, expectedDict): @@ -112,4 +97,6 @@ def iso_chi_squared(actualDict, expectedDict): assert key in actualDict, 'The key {} was not tested'.format(key) actual = actualDict[key] expected = expectedDict[key] - chi_squared_test(actual, expected) + if not dist_test(actual, expected, g_value): + raise Exception('Value of G higher than expected') + diff --git a/cross_compare.py b/cross_compare.py index fcaaafd..e0f1a1c 100755 --- a/cross_compare.py +++ b/cross_compare.py @@ -5,7 +5,7 @@ import pickle from pprint import pprint from copycat import Problem -from copycat.statistics import cross_chi_squared, cross_chi_squared_table +from copycat.statistics import cross_table def compare_sets(): pass @@ -18,9 +18,7 @@ def main(args): pSet = pickle.load(infile) branchProblemSets[filename] = pSet problemSets.append((filename, pSet)) - pprint(problemSets) - pprint(cross_chi_squared(problemSets)) - crossTable = cross_chi_squared_table(problemSets) + crossTable = cross_table(problemSets) key_sorted_items = lambda d : sorted(d.items(), key=lambda t:t[0]) tableItems = key_sorted_items(crossTable)