diff --git a/PCAxis.Serializers/Parquet/ParquetBuilder.cs b/PCAxis.Serializers/Parquet/ParquetBuilder.cs index 4d847a7..605251e 100644 --- a/PCAxis.Serializers/Parquet/ParquetBuilder.cs +++ b/PCAxis.Serializers/Parquet/ParquetBuilder.cs @@ -58,7 +58,12 @@ public Table PopulateTable() int matrixSize = model.Data.MatrixColumnCount * model.Data.MatrixRowCount; double[] data = new double[matrixSize]; int[] variableValueCounts = GetVariableValueCounts(); - var indices = GenerateDataPointIndices(variableValueCounts); + // Build a mask indicating which variables are multi-valued content variables. + bool[] isContentMulti = model.Meta.Variables + .Select(v => v.IsContentVariable && v.Values.Count > 1) + .ToArray(); + + var indices = GenerateDataPointIndices(variableValueCounts, isContentMulti); for (int m = 0; m < matrixSize; m++) { @@ -213,23 +218,26 @@ private void PopulateContentVariableRow(int[] index, int[] variableValueCounts, int columnIndex = dataFieldIndices[columnName]; int symbolColumnIndex = dataFieldIndices[symbolColumnName]; // Get index of the symbol column - int dataIndex = ParquetBuilder.GetDataIndex(index, variableValueCounts); - if (dataIndex + j < data.Length) - { - dataIndex += j; - } + // Compute the exact data index for this content value by cloning the + // multi-dimensional index, setting the content variable coordinate to j, + // and converting to the linear data index. This avoids assumptions about + // dimension ordering or stride between content values. 
+ int varPos = model.Meta.Variables.IndexOf(variable); + int[] indexForValue = (int[])index.Clone(); + indexForValue[varPos] = j; + int dataIndexForValue = ParquetBuilder.GetDataIndex(indexForValue, variableValueCounts); - if (dataIndex >= 0 && dataIndex < data.Length) + if (dataIndexForValue >= 0 && dataIndexForValue < data.Length) { - if (dataSymbolMap.ContainsKey(data[dataIndex])) + if (dataSymbolMap.ContainsKey(data[dataIndexForValue])) { - row[symbolColumnIndex] = dataSymbolMap[data[dataIndex]]; // Set the symbol value + row[symbolColumnIndex] = dataSymbolMap[data[dataIndexForValue]]; // Set the symbol value row[columnIndex] = double.NaN; // Replace the value with double.NaN } else { - row[columnIndex] = data[dataIndex]; + row[columnIndex] = data[dataIndexForValue]; row[symbolColumnIndex] = null; // No symbol } } @@ -248,11 +256,17 @@ private void PopulateContentVariableRow(int[] index, int[] variableValueCounts, private void PopulateNonContentVariableRow(int[] index, object[] row, Dictionary dataFieldIndices, Variable variable, int i) { var value = variable.Values[index[i]].Code; + if (variable.IsTime) { - value = variable.Values[index[i]].TimeValue; - row[dataFieldIndices[variable.Name]] = value; // Original time-value - row[dataFieldIndices["timestamp"]] = ParseTimeScale(value, variable.TimeScale); // Parsed timestamp + // We can't look at TimeValue because in Paxiom TimeValue is wrong + // when the codes and values are not sorted ascending + if (variable.Values.IsCodesFictional) + { + value = variable.Values[index[i]].Value; + } + row[dataFieldIndices[variable.Name]] = value; + row[dataFieldIndices["timestamp"]] = ParseTimeScale(value, variable.TimeScale); } else { @@ -327,6 +341,7 @@ private static DateTime ParseAnnual(string value) /// private static DateTime ParseHalfyear(string value) { + value = new string(value.Where(c => char.IsDigit(c)).ToArray()); if (int.TryParse(value.Substring(0, 4), out int halfYearYear) && int.TryParse(value.Substring(4), out 
int halfYear)) { int monthHalfyear = halfYear == 1 ? 1 : 7; @@ -344,6 +359,7 @@ private static DateTime ParseHalfyear(string value) /// private static DateTime ParseQuarterly(string value) { + value = new string(value.Where(c => char.IsDigit(c)).ToArray()); if (int.TryParse(value.Substring(0, 4), out int quarterYear) && int.TryParse(value.Substring(4), out int quarter)) { int monthQuarter = (quarter - 1) * 3 + 1; @@ -360,6 +376,7 @@ private static DateTime ParseQuarterly(string value) /// private static DateTime ParseMonthly(string value) { + value = new string(value.Where(c => char.IsDigit(c)).ToArray()); if (int.TryParse(value.Substring(0, 4), out int monthYear) && int.TryParse(value.Substring(4, 2), out int month)) { return new DateTime(monthYear, month, 1, 0, 0, 0, DateTimeKind.Utc); @@ -375,6 +392,7 @@ private static DateTime ParseMonthly(string value) /// private static DateTime ParseWeekly(string value) { + value = new string(value.Where(c => char.IsDigit(c)).ToArray()); if (int.TryParse(value.Substring(0, 4), out int weekYear) && int.TryParse(value.Substring(4), out int week)) { DateTime jan1 = new DateTime(weekYear, 1, 1, 0, 0, 0, DateTimeKind.Utc); @@ -412,11 +430,12 @@ static int GetDataIndex(int[] index, int[] variableValueCounts) for (int i = index.Length - 1; i >= 0; i--) { - dataIndex += index[i] * multiplier; - if (i < variableValueCounts.Length - 1) // Adjusting the condition here + if (i < variableValueCounts.Length - 1) // Ensure multiplier equals product of dimensions to the right { multiplier *= variableValueCounts[i + 1]; } + + dataIndex += index[i] * multiplier; } return dataIndex; @@ -473,16 +492,22 @@ private static PXModel RearrangeValues(PXModel model) /// /// An array of integers representing the counts of values for each variable. /// A list of integer arrays representing the data point indices. 
- private static List GenerateDataPointIndices(int[] variableValueCounts) + private static List GenerateDataPointIndices(int[] variableValueCounts, bool[] isContentMulti) { int variableCount = variableValueCounts.Length; - int[] variableIndexCounts = new int[variableCount]; + + // Effective counts treat multi-valued content variables as having count 1 + // so that rows correspond to combinations of the other variables only. + int[] effectiveCounts = new int[variableCount]; + for (int i = 0; i < variableCount; i++) + { + effectiveCounts[i] = (isContentMulti != null && i < isContentMulti.Length && isContentMulti[i]) ? 1 : variableValueCounts[i]; + } int totalDataPoints = 1; for (int i = variableCount - 1; i >= 0; i--) { - variableIndexCounts[i] = totalDataPoints; - totalDataPoints *= variableValueCounts[i]; + totalDataPoints *= effectiveCounts[i]; } List dataPointIndices = new List(totalDataPoints); @@ -494,12 +519,14 @@ private static List GenerateDataPointIndices(int[] variableValueCounts) int tempIndex = dataIndex; for (int variableIndex = 0; variableIndex < variableCount; variableIndex++) { - indices[variableIndex] = tempIndex % variableValueCounts[variableIndex]; - tempIndex /= variableValueCounts[variableIndex]; + // For multi-valued content variables effectiveCounts will be 1 so this yields 0. 
+ indices[variableIndex] = tempIndex % effectiveCounts[variableIndex]; + tempIndex /= effectiveCounts[variableIndex]; } dataPointIndices.Add(indices); } + return dataPointIndices; } } diff --git a/UnitTests/Parquet/ParquetSerializationIntegrationTests.cs b/UnitTests/Parquet/ParquetSerializationIntegrationTests.cs index df6f488..442f3e7 100644 --- a/UnitTests/Parquet/ParquetSerializationIntegrationTests.cs +++ b/UnitTests/Parquet/ParquetSerializationIntegrationTests.cs @@ -44,8 +44,12 @@ public void ShouldSerializePxModel(string pxFile) // Sync wrapper around async call Table table = ReadBackParquetFileSync(outputFile); - // Assertion: Ensure that the model's matrix size is equal to the table's count. - Assert.AreEqual(table.Count, model.Data.MatrixSize, $"Mismatch in matrix size for file {fileNameWithoutExtension}.parquet."); + // Assertion: Ensure that the table's row count equals the number of observations + // for a single ContentsCode. If the model has multiple contents, the serializer + // emits additional content columns rather than duplicating rows. + int contentCount = model.Meta.ContentVariable != null ? model.Meta.ContentVariable.Values.Count : 1; + int expectedRows = model.Data.MatrixSize / contentCount; + Assert.AreEqual(expectedRows, table.Count, $"Mismatch in matrix size for file {fileNameWithoutExtension}.parquet."); // Assertion: Calculate the amount of columns we should have, based on the metadata // Number of columns in meta, number of columns in table. 
@@ -56,6 +60,30 @@ public void ShouldSerializePxModel(string pxFile) Assert.AreEqual(numberOfColsInParq, numberOfColsInPx, $"Mismatch in column number for {fileNameWithoutExtension}.parquet."); } + [TestMethod, Description("Tests correct ordering of time variable (pxfile: 14216.px)")] + [DeploymentItem("TestFiles/14216.px")] + public void TestTimeVariableOrdering() + { + var pxFile = "14216.px"; + var model = GetPxModelFromFile(pxFile); + string fileNameWithoutExtension = Path.GetFileNameWithoutExtension(pxFile); + string outputFile = Path.Combine(OutputDirectoryPath, $"{fileNameWithoutExtension}.parquet"); + SerializePxModelToParquet(model, outputFile); + Table table = ReadBackParquetFileSync(outputFile); + + Assert.AreEqual(2, table.Count, "Test number of rows"); + + Assert.AreEqual("0801", table[0].Values[0], "Test tettsted"); + Assert.AreEqual("2025", table[0].Values[1], "Test year"); + Assert.AreEqual(275.87, table[0].Values[3], "Test ContentsCode_Areal"); + Assert.AreEqual(double.Parse("1110887"), table[0].Values[5], "Test ContentsCode_Bosatte"); + + Assert.AreEqual("0801", table[1].Values[0], "Test tettsted"); + Assert.AreEqual("2024", table[1].Values[1], "Test year"); + Assert.AreEqual(276.30, table[1].Values[3], "Test ContentsCode_Areal"); + Assert.AreEqual(double.Parse("1098061"), table[1].Values[5], "Test ContentsCode_Bosatte"); + } + private static int CalculateNumberOfColumnsFromPxFile(PXModel model) { diff --git a/UnitTests/TestFiles/14216.px b/UnitTests/TestFiles/14216.px new file mode 100644 index 0000000..ded6448 --- /dev/null +++ b/UnitTests/TestFiles/14216.px @@ -0,0 +1,62 @@ +CHARSET="ANSI"; +AXIS-VERSION="2010"; +CODEPAGE="iso-8859-1"; +LANGUAGE="no"; +CREATION-DATE="20260223 22:57"; +DECIMALS=2; +SHOWDECIMALS=0; +MATRIX="14216"; +COPYRIGHT=NO; +SUBJECT-CODE="be"; +SUBJECT-AREA="Befolkning"; +TITLE="14216: Areal og befolkning i tettsteder, etter tettsted, statistikkvariabel og år"; +CONTENTS="14216: Areal og befolkning i tettsteder,"; 
+STUB="tettsted"; +HEADING="statistikkvariabel","år"; +CONTVARIABLE="statistikkvariabel"; +VARIABLECODE("tettsted")="TettSted"; +VALUES("tettsted")="Oslo"; +VARIABLECODE("statistikkvariabel")="ContentsCode"; +VALUES("statistikkvariabel")="Areal av tettsted (km²)","Bosatte"; +VARIABLECODE("år")="Tid"; +VALUES("år")="2025","2024"; +TIMEVAL("år")=TLIST(A1),"2024","2025"; +CODES("tettsted")="0801"; +CODES("statistikkvariabel")="Areal","Bosatte"; +CODES("år")="2025","2024"; +PRESTEXT("tettsted")=2; +PRESTEXT("år")=0; +PRECISION("statistikkvariabel","Areal av tettsted (km²)")=2; +UNITS="km²"; +LAST-UPDATED("Areal av tettsted (km²)")="20251027 08:00"; +STOCKFA("Areal av tettsted (km²)")="S"; +DAYADJ("Areal av tettsted (km²)")=NO; +SEASADJ("Areal av tettsted (km²)")=NO; +REFPERIOD("Areal av tettsted (km²)")="01.01"; +UNITS("Areal av tettsted (km²)")="km²"; +CONTACT("Areal av tettsted (km²)")="Bjørn Lie Rapp, Statistisk sentralbyrå# +47 47 97 17 27#rnl@ssb.no##Vilni Verner Holst Bloch, Statistisk sentralbyrå# +47 99 85 23 42#vvh@ssb.no##"; +LAST-UPDATED("Bosatte")="20251027 08:00"; +STOCKFA("Bosatte")="S"; +DAYADJ("Bosatte")=NO; +SEASADJ("Bosatte")=NO; +REFPERIOD("Bosatte")="01.01"; +UNITS("Bosatte")="personer"; +CONTACT("Bosatte")="Bjørn Lie Rapp, Statistisk sentralbyrå# +47 47 97 17 27#rnl@ssb.no##Vilni Verner Holst Bloch, Statistisk sentralbyrå# +47 99 85 23 42#vvh@ssb.no##"; +DATABASE="Ekstern PROD database O_STATMETA_24 som 2.4"; +SOURCE="Statistisk sentralbyrå"; +INFOFILE="None"; +NOTE="Ikke medregnet personer uten opplysninger om bostedstrøk."; +META-ID="KORTNAVN:beftett"; +META-ID("tettsted")="urn:ssb:classification:klass:110,urn:ssb:conceptvariable:vardok:141"; +DATASYMBOL1=".."; +DATASYMBOL2="..."; +DATASYMBOL3=":"; +DATASYMBOLSUM="."; +DATASYMBOLNIL="-"; +DATANOTESUM="."; +TABLEID="14216"; +VARIABLE-TYPE("tettsted")="V"; +VARIABLE-TYPE("år")="T"; +DATA= +275.87 276.30 1110887.00 1098061.00 +; \ No newline at end of file