diff --git a/copycat/statistics.py b/copycat/statistics.py
index 8c4ce85..bd50e83 100644
--- a/copycat/statistics.py
+++ b/copycat/statistics.py
@@ -24,17 +24,18 @@ _ptable = {
     16:26.3
 }
 
+
+_get_count = lambda k, d : d[k]['count'] if k in d else 0
+
 def g_value(actual, expected):
     # G = 2 * sum(Oi * ln(Oi/Ei))
     answerKeys = set(list(actual.keys()) + list(expected.keys()))
     degreesFreedom = len(answerKeys)
     G = 0
 
-    get_count = lambda k, d : d[k]['count'] if k in d else 0
-
     for k in answerKeys:
-        E = get_count(k, expected)
-        O = get_count(k, actual)
+        E = _get_count(k, expected)
+        O = _get_count(k, actual)
         if E == 0:
             print(' Warning! Expected 0 counts of {}, but got {}'.format(k, O))
         elif O == 0:
@@ -49,17 +50,44 @@ def chi_value(actual, expected):
     degreesFreedom = len(answerKeys)
     chiSquared = 0
 
-    get_count = lambda k, d : d[k]['count'] if k in d else 0
-
     for k in answerKeys:
-        E = get_count(k, expected)
-        O = get_count(k, actual)
+        E = _get_count(k, expected)
+        O = _get_count(k, actual)
         if E == 0:
             print(' Warning! Expected 0 counts of {}, but got {}'.format(k, O))
         else:
             chiSquared += (O - E) ** 2 / E
     return degreesFreedom, chiSquared
 
+def probability_difference(actual, expected):
+    """Return half the summed absolute difference of answer proportions.
+
+    ``actual`` and ``expected`` map an answer key to a dict with a 'count'
+    entry; a key missing from one side counts as 0 there.  Each count is
+    divided by its own side's total, so differently sized samples can be
+    compared.  For non-negative counts the result lies in [0, 1]: 0 when
+    the proportions match exactly, 1 when the sides share no answers.
+
+    If both totals are zero the result is 0.0; if exactly one total is zero
+    the result is 1.0 (previously this raised ZeroDivisionError).
+    """
+    answerKeys = set(actual) | set(expected)
+    expectedC = sum(_get_count(k, expected) for k in answerKeys)
+    actualC = sum(_get_count(k, actual) for k in answerKeys)
+
+    # A side with no counts has no proportions to normalise.
+    if expectedC == 0 or actualC == 0:
+        return 0.0 if expectedC == actualC else 1.0
+
+    p = 0
+    for k in answerKeys:
+        Ep = _get_count(k, expected) / expectedC
+        Op = _get_count(k, actual) / actualC
+        p += abs(Ep - Op)
+
+    # The summed differences lie in [0, 2]; halve to land in [0, 1].
+    return p / 2
+
 def dist_test(actual, expected, calculation):
     df, p = calculation(actual, expected)
     if df not in _ptable:
@@ -67,14 +95,19 @@ def dist_test(actual, expected, calculation):
                 ' Please look up the value and add it to the table in copycat/statistics.py'.format(df))
     return (p < _ptable[df])
 
-def cross_formula_table(actualDict, expectedDict, calculation):
+def cross_formula_table(actualDict, expectedDict, calculation, probs=False):
+    # With probs set, each cell holds probability_difference(actual, expected)
+    # and `calculation` is unused; otherwise each cell holds dist_test(...).
     data = dict()
     for ka, actual in actualDict.items():
         for ke, expected in expectedDict.items():
-            data[(ka, ke)] = dist_test(actual, expected, calculation)
+            if probs:
+                data[(ka, ke)] = probability_difference(actual, expected)
+            else:
+                data[(ka, ke)] = dist_test(actual, expected, calculation)
     return data
 
-def cross_table(problemSets, calculation=g_value):
+def cross_table(problemSets, calculation=g_value, probs=False):
     table = defaultdict(dict)
     for i, (a, problemSetA) in enumerate(problemSets):
         for b, problemSetB in problemSets[i + 1:]:
@@ -89,7 +122,7 @@ def cross_table(problemSets, calculation=g_value):
                            problemA.modified,
                            problemA.target)][(a, b)] = (
                                cross_formula_table(
-                                   answersA, answersB, calculation))
+                                   answersA, answersB, calculation, probs))
     return table
 
 def iso_chi_squared(actualDict, expectedDict):
diff --git a/cross_compare.py b/cross_compare.py
index 1671ac5..ed36bc2 100755
--- a/cross_compare.py
+++ b/cross_compare.py
@@ -19,7 +19,7 @@ def main(args):
             pSet = pickle.load(infile)
             branchProblemSets[filename] = pSet
             problemSets.append((filename, pSet))
-    crossTable = cross_table(problemSets)
+    crossTable = cross_table(problemSets, probs=True)
     key_sorted_items = lambda d : sorted(d.items(), key=lambda t:t[0])
 
     tableItems = key_sorted_items(crossTable)