diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py index 6b2181b..5ef2450 100755 --- a/ipfn/ipfn.py +++ b/ipfn/ipfn.py @@ -182,21 +182,13 @@ def ipfn_df(self, df, aggregates, dimensions, weight_col='total'): print(df) print(df.groupby('age')['total'].sum(), xip)""" - steps = len(aggregates) - tables = [df] - for inc in range(steps - 1): - tables.append(df.copy()) - original = df.copy() - # Calculate the new weights for each dimension inc = 0 + + table_current = df.copy() + for features in dimensions: - if inc == (steps - 1): - table_update = df - table_current = tables[inc].copy() - else: - table_update = tables[inc + 1] - table_current = tables[inc] + table_update = table_current.copy() tmp = table_current.groupby(features)[weight_col].sum() xijk = aggregates[inc] @@ -234,8 +226,10 @@ def ipfn_df(self, df, aggregates, dimensions, weight_col='total'): table_update.reset_index(inplace=True) table_current.reset_index(inplace=True) + + table_current = table_update + inc += 1 - feat_l = [] # Calculate the max convergence rate max_conv = 0 diff --git a/tests/tests.py b/tests/tests.py index 0862937..93071c3 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -286,3 +286,37 @@ def test_pandas_3D(self): for feature in features: assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2) m_inc += 1 + + def test_pandas_3D_with_zeros(self): + categories = { + "A": [3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2], + "B": [1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2], + "C": [1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1], + } + seed_total = [ + 0, 0, 0, 3., + 5., 0, 6., 2., + 2., 1., 0, 2., + 5., 0, 0, 0, + ] + + true_total = [ + 0, 0, 0, 3., + 6., 0, 3., 3., + 2., 10., 0, 7., + 9., 0, 0, 0, + ] + + df_seed = pd.DataFrame(categories | {'total': seed_total}) + df_true = pd.DataFrame(categories | {'total': true_total}) + + dimensions = [['A'], ['B'], ['C'], ['A', 'B'], ['A', 'C'], ['B', 'C']] + aggregates = [df_true.groupby(dimension).sum().total for dimension in dimensions] + + IPF = ipfn.ipfn(df_seed, aggregates, dimensions, convergence_rate=1e-8, rate_tolerance=1e-9) + + df_fitted = IPF.iteration() + df_verify = df_fitted.merge(df_true, on=list(categories.keys())) + + # All fitted values should match known margins within convergence rate + assert ((df_verify.total_x - df_verify.total_y).abs() < 1e-7).all() \ No newline at end of file