From bbcf165bacf6ece65157b49738c38a7f27b54510 Mon Sep 17 00:00:00 2001
From: rtomek <rtomek@uchicago.edu>
Date: Thu, 1 May 2025 16:50:26 -0500
Subject: [PATCH 1/7] Add context menu to copyabletableview

---
 docs/source/modules.rst                       |  6 +-
 jsdconfig-zipcode.yaml                        | 64 +++++++++++---
 .../gui/pyside6/copyabletableview.py          | 22 ++++-
 src/midrc_react/gui/pyside6/jsdview.py        | 87 +++++++++++++++----
 src/midrc_react/plugins/midrc_tsv_loader.py   |  2 +-
 5 files changed, 144 insertions(+), 37 deletions(-)

diff --git a/docs/source/modules.rst b/docs/source/modules.rst
index ba39b8e..952175e 100644
--- a/docs/source/modules.rst
+++ b/docs/source/modules.rst
@@ -1,7 +1,7 @@
-midrc_react
-===========
+src
+===
 
 .. toctree::
-   :maxdepth: 8
+   :maxdepth: 7
 
    midrc_react
diff --git a/jsdconfig-zipcode.yaml b/jsdconfig-zipcode.yaml
index 0f3ee9d..99d8133 100644
--- a/jsdconfig-zipcode.yaml
+++ b/jsdconfig-zipcode.yaml
@@ -1,10 +1,22 @@
 data sources:
   # The data sources will be loaded in the order they are populated here
   - name: MIDRC
-    description: MIDRC Excel File
+    description: MIDRC TSV File
     data type: file
-    filename: data/MIDRC Open A1 and R1 - cumulative by batch.xlsx
-    remove column name text: [(CUSUM)]
+    filename: data/midrc_data_download-2025-01-29.tsv
+    columns:
+      - Age at Index
+      - Ethnicity
+      - Race
+      - Sex
+      - COVID-19 Positive
+      - Race and Ethnicity
+    numeric_cols:
+      Age at Index:
+        raw column: age_at_index
+        bins: [ 0, 5, 12, 16, 18, 30, 40, 50, 65, 75, 85, 1000 ]
+        labels: [ '0-4', '5-11', '12-15', '16-17', '18-29', '30-39', '40-49', '50-64', '65-74', '75-84', '85+' ]
+    plugin: midrc_tsv_loader
 
   - name: CDC
     description: CDC Excel File
@@ -18,13 +30,25 @@ data sources:
     date: '2020-01-01' # The census file does not have a date column, so we specify the date here
 
   - name: MIDRC COVID+
-    description: MIDRC COVID+ Excel File
+    description: MIDRC COVID+ TSV File
     data type: file
-    filename: data/MIDRC Open A1 and R1 COVIDpos only - cumulative by batch.xlsx
-    remove column name text: [(CUSUM)]
+    filename: data/midrc_data_download-2025-01-29_covid_pos.tsv
+    columns:
+      - Age at Index
+      - Ethnicity
+      - Race
+      - Sex
+      - COVID-19 Positive
+      - Race and Ethnicity
+    numeric_cols:
+      Age at Index:
+        raw column: age_at_index
+        bins: [ 0, 5, 12, 16, 18, 30, 40, 50, 65, 75, 85, 1000 ]
+        labels: [ '0-4', '5-11', '12-15', '16-17', '18-29', '30-39', '40-49', '50-64', '65-74', '75-84', '85+' ]
+    plugin: midrc_tsv_loader
 
   - name: Zip Code 1
-    description: Zip Code 1 Excel File
+    description: Zip Code 1 TSV File
     data type: file
     filename: data/midrc_data_download-2025-01-29_0.tsv
     columns:
@@ -32,17 +56,17 @@ data sources:
       - Ethnicity
       - Race
       - Sex
-      - Covid19 Positive
+      - COVID-19 Positive
       - Race and Ethnicity
     numeric_cols:
       Age at Index:
         raw column: age_at_index
-        bins: [ 0, 17, 50, 65, 1000 ]
-        labels: ['0-17', "18-49", '50-64', '65+']
+        bins: [ 0, 5, 12, 16, 18, 30, 40, 50, 65, 75, 85, 1000 ]
+        labels: [ '0-4', '5-11', '12-15', '16-17', '18-29', '30-39', '40-49', '50-64', '65-74', '75-84', '85+' ]
     plugin: midrc_tsv_loader
 
   - name: Zip Code 2
-    description: Zip Code 2 Excel File
+    description: Zip Code 2 TSV File
     data type: file
     filename: data/midrc_data_download-2025-01-29_1.tsv
     columns:
@@ -50,15 +74,27 @@ data sources:
       - Ethnicity
       - Race
       - Sex
-      - Covid19 Positive
+      - COVID-19 Positive
       - Race and Ethnicity
     numeric_cols:
       Age at Index:
         raw column: age_at_index
-        bins: [ 0, 17, 50, 65, 1000 ]
-        labels: ['0-17', "18-49", '50-64', '65+']
+        bins: [ 0, 5, 12, 16, 18, 30, 40, 50, 65, 75, 85, 1000 ]
+        labels: [ '0-4', '5-11', '12-15', '16-17', '18-29', '30-39', '40-49', '50-64', '65-74', '75-84', '85+' ]
     plugin: midrc_tsv_loader
 
+  - name: MIDRC COVID+
+    description: MIDRC COVID+ Excel File
+    data type: file
+    filename: data/MIDRC Open A1 and R1 COVIDpos only - cumulative by batch.xlsx
+    remove column name text: [(CUSUM)]
+
+  - name: MIDRC
+    description: MIDRC Excel File
+    data type: file
+    filename: data/MIDRC Open A1 and R1 - cumulative by batch.xlsx
+    remove column name text: [(CUSUM)]
+
 # TODO: The following should be moved into QSettings for modifications within the GUI
 # For custom age columns, please use .inf as the maximum age in the final age group
 custom age ranges:
diff --git a/src/midrc_react/gui/pyside6/copyabletableview.py b/src/midrc_react/gui/pyside6/copyabletableview.py
index f3cb80e..9472871 100644
--- a/src/midrc_react/gui/pyside6/copyabletableview.py
+++ b/src/midrc_react/gui/pyside6/copyabletableview.py
@@ -23,7 +23,7 @@
 
 from PySide6.QtCore import QDate, QEvent, QObject, Qt
 from PySide6.QtGui import QGuiApplication, QKeySequence
-from PySide6.QtWidgets import QTableView
+from PySide6.QtWidgets import QTableView, QMenu
 
 
 class CopyableTableView(QTableView):
@@ -78,3 +78,23 @@ def copy_selection(self) -> None:
             stream = io.StringIO()
             csv.writer(stream, delimiter='\t').writerows(table)
             QGuiApplication.clipboard().setText(stream.getvalue())
+
+    def contextMenuEvent(self, event) -> None:
+        """
+        Create a context menu with 'Select All' and 'Copy' options on right-click.
+
+        Args:
+            event (QContextMenuEvent): The context menu event.
+
+        Returns:
+            None
+        """
+        menu = QMenu(self)
+        select_all_action = menu.addAction("Select All")
+        copy_action = menu.addAction("Copy")
+
+        action = menu.exec(event.globalPos())
+        if action == select_all_action:
+            self.selectAll()
+        elif action == copy_action:
+            self.copy_selection()
diff --git a/src/midrc_react/gui/pyside6/jsdview.py b/src/midrc_react/gui/pyside6/jsdview.py
index 473029f..4c22096 100644
--- a/src/midrc_react/gui/pyside6/jsdview.py
+++ b/src/midrc_react/gui/pyside6/jsdview.py
@@ -455,7 +455,7 @@ def update_spider_chart_title(self, file1_data: str = 'File 1', file2_data: str
 
     def update_area_chart(self, category: Dict[Any, Any]) -> bool:
         """
-        Update the area chart with new data from the provided sheets.
+        Update the area chart with new data from the provided sheets, ensuring the series follow a common order.
 
         Args:
             category (dict): A dictionary where each key maps to a sheet containing chart data.
@@ -466,36 +466,80 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool:
         category_str: str = self._dataselectiongroupbox.category_combobox.currentText()
         clear_layout(self.area_chart_widget.layout())
 
+        # Compute the global minimum and maximum dates across all sheets
+        global_min = None
+        global_max = None
+        for sheets in category.values():
+            cat = category_str[:-6] if category_str.endswith(" (ks2)") else category_str
+            if cat in sheets:
+                df = sheets[cat].df
+                dates_list = [
+                    QDateTime(numpy_datetime64_to_qdate(date), QTime())
+                    for date in df.date.values
+                ]
+                if dates_list:
+                    local_min = min(dates_list, key=lambda d: d.toMSecsSinceEpoch())
+                    local_max = max(dates_list, key=lambda d: d.toMSecsSinceEpoch())
+                    if global_min is None or local_min < global_min:
+                        global_min = local_min
+                    if global_max is None or local_max > global_max:
+                        global_max = local_max
+
+        # If no dates found, return early.
+        if global_min is None or global_max is None:
+            return False
+
+        # Compute the union order (common order) for the category across all sheets.
+        common_order: List[str] = []
+        for sheets in category.values():
+            cat = category_str[:-6] if category_str.endswith(" (ks2)") else category_str
+            if cat in sheets:
+                for col in sheets[cat].data_columns:
+                    if col not in common_order:
+                        common_order.append(col)
+
         for index, sheets in category.items():
+            cat = category_str[:-6] if category_str.endswith(" (ks2)") else category_str
             area_chart: QChart = QChart()
             filename: str = self.dataselectiongroupbox.file_comboboxes[index].currentData()
-            area_chart.setTitle(f'{filename} {category_str} distribution over time')
+            area_chart.setTitle(f"{filename} {cat} distribution over time")
 
-            if category_str.endswith(' (ks2)'):
-                category_str = category_str[:-6]
-            if category_str not in sheets:
+            if cat not in sheets:
                 continue
 
-            df = sheets[category_str].df
-            cols_to_use = sheets[category_str].data_columns
-            dates: List[QDateTime] = [QDateTime(numpy_datetime64_to_qdate(date), QTime()) for date in df.date.values]
-
-            JsdWindow._add_area_chart_series(area_chart, df, cols_to_use, dates)
-            JsdWindow._attach_axes_to_area_chart(area_chart, dates)
+            df = sheets[cat].df
+            sheet_order = sheets[cat].data_columns
+            # Compute final order with common ordering
+            final_order = [col for col in common_order if col in sheet_order]
+            final_order += [col for col in sheet_order if col not in final_order]
+            if "Not Reported" in final_order:
+                final_order = [col for col in final_order if col != "Not Reported"] + ["Not Reported"]
+
+            dates: List[QDateTime] = [
+                QDateTime(numpy_datetime64_to_qdate(date), QTime())
+                for date in df.date.values
+            ]
+            # Call the modified helper with the global max date.
+            JsdWindow._add_area_chart_series(area_chart, df, final_order, dates, global_max)
+            # Attach axes using global min and max so that x-axis is consistent.
+            JsdWindow._attach_axes_to_area_chart(area_chart, global_min, global_max)
             self.add_area_chart_view(area_chart)
 
         return True
 
     @staticmethod
-    def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str], dates: List[QDateTime]) -> None:
+    def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str],
+                               dates: List[QDateTime], global_max: QDateTime) -> None:
         """
         Add series to an area chart based on provided data.
+        If the last date is before global_max, an additional data point is appended with the same series value.
 
         Args:
             area_chart (QChart): The chart to update.
             df (DataFrame): Data source for the series.
             cols_to_use (List[str]): List of columns to plot.
             dates (List[QDateTime]): X-axis dates for the chart.
+            global_max (QDateTime): The global maximum date.
 
         Returns:
             None
@@ -504,6 +548,7 @@ def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str],
         total_counts = df_cols.sum(axis=1)
         cumulative_percents = 100.0 * df_cols.cumsum(axis=1).div(total_counts, axis=0)
         lower_series = None
+        global_max_msecs = global_max.toMSecsSinceEpoch()
 
         for col in cols_to_use:
             if df_cols[col].iloc[-1] == 0:
@@ -512,8 +557,12 @@ def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str],
                 QPointF(dates[i].toMSecsSinceEpoch(), cumulative_percents.iloc[i][col])
                 for i in range(len(dates))
             ]
-            if len(dates) == 1:
-                points.append(QPointF(dates[0].toMSecsSinceEpoch() + 1, cumulative_percents.iloc[0][col]))
+            # If the last data point is before global_max, add an extra point.
+            if points and points[-1].x() < global_max_msecs:
+                points.append(QPointF(global_max_msecs, points[-1].y()))
+            # In case only one date exists, also add a second point slightly offset if needed.
+            if len(points) == 1:
+                points.append(QPointF(points[0].x() + 1, points[0].y()))
             upper_series: QLineSeries = QLineSeries(area_chart)
             upper_series.append(points)
             area_series: QAreaSeries = QAreaSeries(upper_series, lower_series)
@@ -522,13 +571,15 @@ def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str],
             lower_series = upper_series
 
     @staticmethod
-    def _attach_axes_to_area_chart(area_chart: QChart, dates: List[QDateTime]) -> None:
+    def _attach_axes_to_area_chart(area_chart: QChart, global_min: QDateTime,
+                               global_max: QDateTime) -> None:
         """
-        Attach X and Y axes to an area chart.
+        Attach X and Y axes to an area chart using global min and max dates for the X-axis.
 
         Args:
             area_chart (QChart): The chart to attach axes to.
-            dates (List[QDateTime]): List of dates for the X-axis.
+            global_min (QDateTime): The global minimum date.
+            global_max (QDateTime): The global maximum date.
 
         Returns:
             None
@@ -537,7 +588,7 @@ def _attach_axes_to_area_chart(area_chart: QChart, dates: List[QDateTime]) -> No
         axis_x.setTickCount(10)
         axis_x.setFormat("MMM yyyy")
         axis_x.setTitleText("Date")
-        axis_x.setRange(dates[0], dates[-1] if len(dates) > 1 else dates[0].addMSecs(1))
+        axis_x.setRange(global_min, global_max)
         area_chart.addAxis(axis_x, Qt.AlignBottom)
 
         axis_y: QValueAxis = QValueAxis()
diff --git a/src/midrc_react/plugins/midrc_tsv_loader.py b/src/midrc_react/plugins/midrc_tsv_loader.py
index dee9af0..5c17c82 100644
--- a/src/midrc_react/plugins/midrc_tsv_loader.py
+++ b/src/midrc_react/plugins/midrc_tsv_loader.py
@@ -87,7 +87,7 @@ def classify(row):
 
         if ethnicity == 'Hispanic or Latino':
             return ethnicity
-        if race == 'Not Reported' or ethnicity == 'Not Reported' or pd.isna(race) or pd.isna(ethnicity):
+        if race == 'Not Reported' or ethnicity == 'Not Reported': # or pd.isna(race) or pd.isna(ethnicity):
             return 'Not Reported'
         return f'{race}, {ethnicity}'
 

From 7d246494a3a90a0f2db12c6854bde6d78891903e Mon Sep 17 00:00:00 2001
From: Robert Tomek <robt_0@hotmail.com>
Date: Thu, 22 May 2025 15:00:45 -0500
Subject: [PATCH 2/7] fix potential issues with data types

---
 src/midrc_react/core/data_preprocessing.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/midrc_react/core/data_preprocessing.py b/src/midrc_react/core/data_preprocessing.py
index f22d9f7..a5b61a3 100644
--- a/src/midrc_react/core/data_preprocessing.py
+++ b/src/midrc_react/core/data_preprocessing.py
@@ -60,8 +60,12 @@ def _adjust_outliers(df: pd.DataFrame, cut_column_name: str, column_name: str, b
     low_text = "Outlier_Low"
     high_text = "Outlier_High"
     print(f"WARNING: There are values outside the bins specified for the '{column_name}' column.")
-    df.loc[df[cut_column_name].isna() & (df[column_name] < bins[0]), cut_column_name] = low_text
-    df.loc[df[cut_column_name].isna() & (df[column_name] >= bins[-1]), cut_column_name] = high_text
+
+    # Only compare numeric values, ignore strings or other types
+    col_numeric = pd.to_numeric(df[column_name], errors='coerce')
+
+    df.loc[df[cut_column_name].isna() & (col_numeric < bins[0]), cut_column_name] = low_text
+    df.loc[df[cut_column_name].isna() & (col_numeric >= bins[-1]), cut_column_name] = high_text
     df.loc[df[cut_column_name].isna(), cut_column_name] = new_text
     if (df[cut_column_name] == low_text).sum() > 0:
         print(f"         {(df[cut_column_name] == low_text).sum()} values are below the min bin value.\n"

From 9cb23646b407e81aacb298d215e5719aabbf5a56 Mon Sep 17 00:00:00 2001
From: Robert Tomek <robt_0@hotmail.com>
Date: Thu, 22 May 2025 16:12:04 -0500
Subject: [PATCH 3/7] fix potential bug with dataset column

---
 src/midrc_react/core/famd_calc.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/midrc_react/core/famd_calc.py b/src/midrc_react/core/famd_calc.py
index 19fe6e4..5468463 100644
--- a/src/midrc_react/core/famd_calc.py
+++ b/src/midrc_react/core/famd_calc.py
@@ -165,7 +165,11 @@ def calc_famd_distances(df, cols_to_use, numeric_cols, dataset_column='_dataset_
         dict: Dictionary of distance values specified in distance_metrics for each dataset combination.
 
     """
-    return calc_distances_via_df(calc_famd_df(df, cols_to_use, numeric_cols, print_outliers=print_outliers),
+    return calc_distances_via_df(calc_famd_df(df,
+                                              cols_to_use,
+                                              numeric_cols,
+                                              dataset_column,
+                                              print_outliers=print_outliers),
                                  'famd_x_coordinates',
                                  dataset_column,
                                  distance_metrics=distance_metrics,

From f228c5e14ffe79200b160f478a8db11d1a6794e2 Mon Sep 17 00:00:00 2001
From: Robert Tomek <robt_0@hotmail.com>
Date: Thu, 22 May 2025 18:04:22 -0500
Subject: [PATCH 4/7] Small tweaks to how the dataset column is handled for
 consistency

---
 src/midrc_react/core/aggregate_jsd_calc.py | 32 ++++++++++++----------
 src/midrc_react/core/famd_calc.py          | 13 ++++-----
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/src/midrc_react/core/aggregate_jsd_calc.py b/src/midrc_react/core/aggregate_jsd_calc.py
index 5a9168e..e8c1a9c 100644
--- a/src/midrc_react/core/aggregate_jsd_calc.py
+++ b/src/midrc_react/core/aggregate_jsd_calc.py
@@ -62,6 +62,22 @@ def calc_jsd_from_counts_dict(counts_dict, dataset_names):
 
     return output_dict
 
+def calc_jsd_by_features_combined(combined_df: pd.DataFrame, cols_to_use: list[str], dataset_column) -> dict[str, float]:
+    # Pivot table to get counts for each combination
+    pivot_table = combined_df.pivot_table(index=cols_to_use, columns=dataset_column, aggfunc='size', fill_value=0)
+    pivot_table = pivot_table.reset_index()
+
+    # Convert dataset columns to string in case they are integers
+    pivot_table.columns = pivot_table.columns.astype(str)
+
+    labels = combined_df[dataset_column].unique().astype(str)
+
+    # Create a dictionary to hold counts for each dataset
+    counts_dict = {dataset: pivot_table[dataset].values if dataset in pivot_table else np.zeros(len(pivot_table)) for
+                   dataset in labels}
+
+    return calc_jsd_from_counts_dict(counts_dict, labels)
+
 
 def calc_jsd_by_features(df_list: list[pd.DataFrame], cols_to_use: list[str]) -> dict[str, float]:
     """
@@ -76,21 +92,7 @@ def calc_jsd_by_features(df_list: list[pd.DataFrame], cols_to_use: list[str]) ->
     """
     dataset_column = '_dataset_'  # Temporary column name to store dataset information
     combined_df = combine_datasets_from_list(df_list, dataset_column)
-
-    # Pivot table to get counts for each combination
-    pivot_table = combined_df.pivot_table(index=cols_to_use, columns=dataset_column, aggfunc='size', fill_value=0)
-    pivot_table = pivot_table.reset_index()
-
-    # Convert dataset columns to string in case they are integers
-    pivot_table.columns = pivot_table.columns.astype(str)
-
-    labels = combined_df[dataset_column].unique()
-
-    # Create a dictionary to hold counts for each dataset
-    counts_dict = {dataset: pivot_table[dataset].values if dataset in pivot_table else np.zeros(len(pivot_table)) for
-                   dataset in labels}
-
-    return calc_jsd_from_counts_dict(counts_dict, labels)
+    return calc_jsd_by_features_combined(combined_df, cols_to_use, dataset_column)
 
 
 def calc_jsd_by_features_2df(df1: pd.DataFrame, df2: pd.DataFrame, cols_to_use: list[str]) -> float:
diff --git a/src/midrc_react/core/famd_calc.py b/src/midrc_react/core/famd_calc.py
index 5468463..5889857 100644
--- a/src/midrc_react/core/famd_calc.py
+++ b/src/midrc_react/core/famd_calc.py
@@ -169,15 +169,16 @@ def calc_famd_distances(df, cols_to_use, numeric_cols, dataset_column='_dataset_
                                               cols_to_use,
                                               numeric_cols,
                                               dataset_column,
-                                              print_outliers=print_outliers),
+                                              print_outliers=print_outliers
+                                              ),
                                  'famd_x_coordinates',
-                                 dataset_column,
+                                 dataset_column=dataset_column,
                                  distance_metrics=distance_metrics,
                                  jsd_scaled_bin_width=jsd_scaled_bin_width,
                                  )
 
 
-def calc_famd_ks2_at_date(df1, df2, cols_to_use, numeric_cols, calc_date):
+def calc_famd_ks2_at_date(df1, df2, cols_to_use, numeric_cols, calc_date, dataset_column='_dataset_'):
     """
     Calculate the KS2 distance between two datasets at a specific date.
 
@@ -194,7 +195,6 @@ def calc_famd_ks2_at_date(df1, df2, cols_to_use, numeric_cols, calc_date):
     df1_at_date = df1[df1['date'] <= calc_date]
     df2_at_date = df2[df2['date'] <= calc_date]
 
-    dataset_column = '_dataset_'
     combined_df = combine_datasets_from_list([df1_at_date, df2_at_date], dataset_column=dataset_column)
 
     distance_metrics = ['ks2']
@@ -203,7 +203,7 @@ def calc_famd_ks2_at_date(df1, df2, cols_to_use, numeric_cols, calc_date):
     return distance_dict['ks2']['Dataset 0 vs Dataset 1']
 
 
-def calc_famd_ks2_at_dates(df1, df2, cols_to_use, numeric_cols, calc_date_list):
+def calc_famd_ks2_at_dates(df1, df2, cols_to_use, numeric_cols, calc_date_list, dataset_column='_dataset_'):
     """
     Calculate the KS2 distance between two datasets at multiple dates.
 
@@ -217,10 +217,9 @@ def calc_famd_ks2_at_dates(df1, df2, cols_to_use, numeric_cols, calc_date_list):
     Returns:
         list(float): list of KS2 distances at each date
     """
-    dataset_column = '_dataset_'
     combined_df = combine_datasets_from_list([df1, df2], dataset_column=dataset_column)
 
-    famd_df = calc_famd_df(combined_df, cols_to_use, numeric_cols)
+    famd_df = calc_famd_df(combined_df, cols_to_use, numeric_cols, dataset_column=dataset_column)
 
     # Add date column to the DataFrame after FAMD fitting
     famd_df['date'] = combined_df['date']

From 0970b654c6036235671ee66d40dca5818b82762a Mon Sep 17 00:00:00 2001
From: Robert Tomek <robt_0@hotmail.com>
Date: Thu, 22 May 2025 19:32:43 -0500
Subject: [PATCH 5/7] Make sure non-numeric columns are strings to avoid
 potential issues with blank data in CSV/TSV files

---
 src/midrc_react/core/excel_layout.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/midrc_react/core/excel_layout.py b/src/midrc_react/core/excel_layout.py
index 7a2614e..69ac805 100644
--- a/src/midrc_react/core/excel_layout.py
+++ b/src/midrc_react/core/excel_layout.py
@@ -162,6 +162,11 @@ def build_data_frames_from_csv(self, filename: str):
         # Apply numeric column adjustments
         df = self.apply_numeric_column_adjustments(df)
 
+        # Convert all non-numeric columns to string
+        for col in self._columns:
+            if col in df.columns:
+                df[col] = df[col].astype(str)
+
         self.raw_data = df
         self.create_sheets_from_df(df)
 

From 363bc8fb0e0feb623b122323ea67ba83a5872f4f Mon Sep 17 00:00:00 2001
From: Robert Tomek <robt_0@hotmail.com>
Date: Thu, 22 May 2025 20:41:36 -0500
Subject: [PATCH 6/7] Fix some display issues for some tsv file input

---
 src/midrc_react/core/jsdcontroller.py                | 10 ++++++----
 src/midrc_react/gui/pyside6/dataselectiongroupbox.py |  2 ++
 src/midrc_react/gui/pyside6/jsdview.py               | 11 +++++------
 src/midrc_react/plugins/midrc_tsv_loader.py          | 11 +++++++++++
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/midrc_react/core/jsdcontroller.py b/src/midrc_react/core/jsdcontroller.py
index 9e65307..84c0998 100644
--- a/src/midrc_react/core/jsdcontroller.py
+++ b/src/midrc_react/core/jsdcontroller.py
@@ -96,10 +96,9 @@ def connect_signals(self):
         dataselectiongroupbox_class_name = type(jsd_view.dataselectiongroupbox).__name__
 
         if dataselectiongroupbox_class_name == 'JsdDataSelectionGroupBox':
-            for f_c in jsd_view.dataselectiongroupbox.file_comboboxes:
-                f_c.currentIndexChanged.connect(self.file_changed)
             jsd_view.dataselectiongroupbox.num_data_items_changed.connect(self.file_changed)
             jsd_view.dataselectiongroupbox.file_checkbox_state_changed.connect(self.file_changed)
+            jsd_view.dataselectiongroupbox.file_combobox_changed.connect(self.file_changed)
             jsd_view.dataselectiongroupbox.category_combobox.currentIndexChanged.connect(self.category_changed)
 
         elif dataselectiongroupbox_class_name == 'DataSelectionGroupBox':
@@ -399,12 +398,15 @@ def update_file_based_charts(self):
                 sheet_dict[i] = self.get_file_sheets_from_index(i)
 
         spider_plot_values = self.get_spider_plot_values(spider_plot_date)
-        self.jsd_view.update_spider_chart(spider_plot_values)
+        try:
+            self.jsd_view.update_spider_chart(spider_plot_values)
+        except (ValueError, KeyError, TypeError):
+            print('An error occurred during the update of the spider chart.')
 
         try:
             self.jsd_view.update_pie_chart_dock(sheet_dict)
         except (ValueError, KeyError, TypeError):
-            return False
+            print('An error occurred during the update of file-based charts.')
 
         return True
 
diff --git a/src/midrc_react/gui/pyside6/dataselectiongroupbox.py b/src/midrc_react/gui/pyside6/dataselectiongroupbox.py
index eaf9aeb..69f494b 100644
--- a/src/midrc_react/gui/pyside6/dataselectiongroupbox.py
+++ b/src/midrc_react/gui/pyside6/dataselectiongroupbox.py
@@ -38,6 +38,7 @@ class JsdDataSelectionGroupBox(QGroupBox, GroupBoxData):
     """
     num_data_items_changed: Signal = Signal(int)
     file_checkbox_state_changed: Signal = Signal(bool)
+    file_combobox_changed: Signal = Signal(int)
     NUM_DEFAULT_DATA_ITEMS: int = 2
 
     def __init__(self, data_sources):
@@ -145,6 +146,7 @@ def add_file_combobox_to_layout(self, auto_populate: bool = True):
         self.form_layout.insertRow(index - 1, new_label, new_hbox)
 
         self.file_comboboxes.append(new_combobox)
+        new_combobox.currentIndexChanged.connect(self.file_combobox_changed.emit)
         self.file_checkboxes.append(new_checkbox)
         new_checkbox.toggled.connect(self.file_checkbox_state_changed.emit)
 
diff --git a/src/midrc_react/gui/pyside6/jsdview.py b/src/midrc_react/gui/pyside6/jsdview.py
index cb8e0a1..f34a2cb 100644
--- a/src/midrc_react/gui/pyside6/jsdview.py
+++ b/src/midrc_react/gui/pyside6/jsdview.py
@@ -310,9 +310,10 @@ def update_pie_chart_dock(self, sheet_dict: Dict[Any, Any]) -> None:
         common_order: Dict[str, List[str]] = {}
         for category in categories:
             common_order[category] = []
+
             for sheets in sheet_dict.values():
                 if category in sheets:
-                    for col in sheets[category].data_columns:
+                    for col in sorted(sheets[category].data_columns, key=str.lower):
                         if col not in common_order[category]:
                             common_order[category].append(col)
 
@@ -335,8 +336,7 @@ def update_pie_chart_dock(self, sheet_dict: Dict[Any, Any]) -> None:
                 # Append any extra columns from the sheet that are not already in final_order.
                 final_order += [col for col in sheet_order if col not in final_order]
                 # Ensure 'Not Reported' is always the last column.
-                if "Not Reported" in final_order:
-                    final_order = [col for col in final_order if col != "Not Reported"] + ["Not Reported"]
+                final_order.sort(key=lambda x: x.lower() in ['nan', 'not reported', 'none'])
 
                 series = QPieSeries()
                 for col in final_order:
@@ -517,7 +517,7 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool:
         for sheets in category.values():
             cat = category_str[:-6] if category_str.endswith(" (ks2)") else category_str
             if cat in sheets:
-                for col in sheets[cat].data_columns:
+                for col in sorted(sheets[cat].data_columns, key=str.lower):
                     if col not in common_order:
                         common_order.append(col)
 
@@ -535,8 +535,7 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool:
             # Compute final order with common ordering
             final_order = [col for col in common_order if col in sheet_order]
             final_order += [col for col in sheet_order if col not in final_order]
-            if "Not Reported" in final_order:
-                final_order = [col for col in final_order if col != "Not Reported"] + ["Not Reported"]
+            final_order.sort(key=lambda x: x.lower() in ['nan', 'not reported', 'none'])
 
             dates: List[QDateTime] = [
                 QDateTime(numpy_datetime64_to_qdate(date), QTime())
diff --git a/src/midrc_react/plugins/midrc_tsv_loader.py b/src/midrc_react/plugins/midrc_tsv_loader.py
index 5c17c82..511b93b 100644
--- a/src/midrc_react/plugins/midrc_tsv_loader.py
+++ b/src/midrc_react/plugins/midrc_tsv_loader.py
@@ -105,10 +105,21 @@ def adjust_column_names(df):
     })
     return df
 
+def fix_nan(df):
+    """Replaces NaN values with 'Not Reported'."""
+    cols_to_nr = ['sex', 'race', 'ethnicity', 'covid19_positive', 'study_modalities', 'loinc_methods']
+    for col in cols_to_nr:
+        if col in df.columns:
+            df[col] = df[col].fillna('Not Reported')
+    if 'loinc_methods_xr' in df.columns:
+        df['loinc_methods_xr'] = df['loinc_methods_xr'].fillna('None')
+    return df
+
 
 def process_dataframe(df):
     """Applies both transformations on a pandas DataFrame."""
     df['date'] = extract_earliest_date(df['datasets.submitter_id'])
+    df = fix_nan(df)
     df = adjust_age(df)
     df = adjust_race(df)
     df = combine_race_ethnicity(df)

From 4736fade939319c9e6e224ac0d7080a0a6d36640 Mon Sep 17 00:00:00 2001
From: Robert Tomek <robt_0@hotmail.com>
Date: Fri, 23 May 2025 12:15:43 -0500
Subject: [PATCH 7/7] Use a common color palette across all files, and across
 pie charts to area charts

---
 src/midrc_react/gui/pyside6/jsdview.py      | 69 +++++++++++++--------
 src/midrc_react/plugins/midrc_tsv_loader.py | 25 ++++++--
 2 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/src/midrc_react/gui/pyside6/jsdview.py b/src/midrc_react/gui/pyside6/jsdview.py
index f34a2cb..1f716ad 100644
--- a/src/midrc_react/gui/pyside6/jsdview.py
+++ b/src/midrc_react/gui/pyside6/jsdview.py
@@ -29,7 +29,7 @@
 from PySide6.QtCore import (
     QDateTime, QPointF, QRect, Qt, QTime, Signal,
 )
-from PySide6.QtGui import QAction, QPainter
+from PySide6.QtGui import QAction, QPainter, QPen, QBrush, QColor
 from PySide6.QtWidgets import (
     QDialog, QDialogButtonBox, QDockWidget, QFormLayout, QHBoxLayout, QHeaderView,
     QLabel, QLayout, QMainWindow, QMenu, QMenuBar, QScrollArea, QSpinBox, QSplitter,
@@ -63,6 +63,11 @@ class JsdWindow(QMainWindow, JsdViewBase):
         'pie_chart_dock': 'Pie Charts - ' + WINDOW_TITLE,
         'spider_chart_dock': 'Distribution Charts - ' + WINDOW_TITLE,
     }
+    SORT_TO_END = ['nan', 'not reported', 'none', 'missing', 'not available', 'not applicable', 'n/a']
+    # Shared palette so each category keeps the same color in pie and area charts
+    chart_palette: List[str] = ["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c",
+                                "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00",
+                                "#cab2d6", "#6a3d9a"]
 
     def __init__(self, data_sources: Any) -> None:
         """
@@ -317,6 +322,13 @@ def update_pie_chart_dock(self, sheet_dict: Dict[Any, Any]) -> None:
                         if col not in common_order[category]:
                             common_order[category].append(col)
 
+        common_palette: Dict[str, List[str]] = {}
+        for category in categories:
+            palette = []
+            for i, _ in enumerate(common_order[category]):
+                palette.append(JsdWindow.chart_palette[i % len(JsdWindow.chart_palette)])
+            common_palette[category] = palette
+
         timepoint: int = -1
         file_comboboxes = self.dataselectiongroupbox.file_comboboxes
         labels: List[QLabel] = JsdWindow._create_pie_chart_labels(sheet_dict, file_comboboxes)
@@ -336,13 +348,16 @@ def update_pie_chart_dock(self, sheet_dict: Dict[Any, Any]) -> None:
                 # Append any extra columns from the sheet that are not already in final_order.
                 final_order += [col for col in sheet_order if col not in final_order]
                 # Ensure 'Not Reported' is always the last column.
-                final_order.sort(key=lambda x: x.lower() in ['nan', 'not reported', 'none'])
+                final_order.sort(key=lambda x: x.lower() in JsdWindow.SORT_TO_END)
 
                 series = QPieSeries()
                 for col in final_order:
                     value = df[col].iloc[timepoint]
                     if value > 0:
-                        series.append(col, value)
+                        slice = series.append(col, value)
+                        # Lookup common order index to get consistent color across rows.
+                        idx = common_order[category].index(col)
+                        slice.setColor(QColor(common_palette[category][idx]))
                 if not series.isEmpty():
                     row_layout.addWidget(JsdWindow._create_pie_chart_series(series, category), stretch=1)
             self.pie_chart_layout.addLayout(row_layout, stretch=1)
@@ -535,14 +550,14 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool:
             # Compute final order with common ordering
             final_order = [col for col in common_order if col in sheet_order]
             final_order += [col for col in sheet_order if col not in final_order]
-            final_order.sort(key=lambda x: x.lower() in ['nan', 'not reported', 'none'])
+            final_order.sort(key=lambda x: x.lower() in JsdWindow.SORT_TO_END)
 
             dates: List[QDateTime] = [
                 QDateTime(numpy_datetime64_to_qdate(date), QTime())
                 for date in df.date.values
             ]
-            # Call the modified helper with the global max date.
-            JsdWindow._add_area_chart_series(area_chart, df, final_order, dates, global_max)
+            # Pass common_order for color assignment
+            JsdWindow._add_area_chart_series(area_chart, df, final_order, dates, global_max, common_order)
             # Attach axes using global min and max so that x-axis is consistent.
             JsdWindow._attach_axes_to_area_chart(area_chart, global_min, global_max)
             self.add_area_chart_view(area_chart)
@@ -551,20 +566,19 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool:
 
     @staticmethod
     def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str],
-                               dates: List[QDateTime], global_max: QDateTime) -> None:
+                               dates: List[QDateTime], global_max: QDateTime,
+                               common_order: List[str]) -> None:
         """
-        Add series to an area chart based on provided data.
-        If the last date is before global_max, an additional data point is appended with the same series value.
+        Add series to an area chart based on provided data and set its color using chart_palette.
+        If the last date is before global_max, append a point to the series.
 
         Args:
-            area_chart (QChart): The chart to update.
-            df (DataFrame): Data source for the series.
-            cols_to_use (List[str]): List of columns to plot.
-            dates (List[QDateTime]): X-axis dates for the chart.
-            global_max (QDateTime): The global maximum date.
-
-        Returns:
-            None
+            area_chart (QChart): The chart to add series to.
+            df (DataFrame): The data frame containing the data.
+            cols_to_use (List[str]): List of columns to use for the series.
+            dates (List[QDateTime]): List of dates for the x-axis.
+            global_max (QDateTime): The maximum date for the x-axis.
+            common_order (List[str]): The common order for the series.
         """
         df_cols = df[cols_to_use]
         total_counts = df_cols.sum(axis=1)
@@ -572,23 +586,27 @@ def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str],
         lower_series = None
         global_max_msecs = global_max.toMSecsSinceEpoch()
 
-        for col in cols_to_use:
+        for i, col in enumerate(cols_to_use):
             if df_cols[col].iloc[-1] == 0:
                 continue
-            points: List[QPointF] = [
-                QPointF(dates[i].toMSecsSinceEpoch(), cumulative_percents.iloc[i][col])
-                for i in range(len(dates))
-            ]
-            # If the last data point is before global_max, add an extra point.
+            points: List[QPointF] = [QPointF(dates[j].toMSecsSinceEpoch(), cumulative_percents.iloc[j][col])
+                                      for j in range(len(dates))]
             if points and points[-1].x() < global_max_msecs:
                 points.append(QPointF(global_max_msecs, points[-1].y()))
-            # In case only one date exists, also add a second point slightly offset if needed.
             if len(points) == 1:
                 points.append(QPointF(points[0].x() + 1, points[0].y()))
             upper_series: QLineSeries = QLineSeries(area_chart)
             upper_series.append(points)
             area_series: QAreaSeries = QAreaSeries(upper_series, lower_series)
             area_series.setName(col)
+            # Determine the color using common_order and chart_palette class attribute.
+            try:
+                idx = common_order.index(col)
+            except ValueError:
+                idx = i
+            color = QColor(JsdWindow.chart_palette[idx % len(JsdWindow.chart_palette)])
+            area_series.setBrush(QBrush(color))
+            area_series.setPen(QPen(color))
             area_chart.addSeries(area_series)
             lower_series = upper_series
 
@@ -785,6 +803,3 @@ def clear_layout(layout: Optional[QLayout]) -> bool:
         layout.removeItem(child)
 
     return True
-
-
-
diff --git a/src/midrc_react/plugins/midrc_tsv_loader.py b/src/midrc_react/plugins/midrc_tsv_loader.py
index 511b93b..2d6ce1b 100644
--- a/src/midrc_react/plugins/midrc_tsv_loader.py
+++ b/src/midrc_react/plugins/midrc_tsv_loader.py
@@ -106,19 +106,32 @@ def adjust_column_names(df):
     return df
 
 def fix_nan(df):
-    """Replaces NaN values with 'Not Reported'."""
-    cols_to_nr = ['sex', 'race', 'ethnicity', 'covid19_positive', 'study_modalities', 'loinc_methods']
-    for col in cols_to_nr:
+    """Replaces NaN values with a per-column placeholder (e.g. 'Not Reported')."""
+    cols_to_nr = {
+        'sex': 'Not Reported',
+        'race': 'Not Reported',
+        'ethnicity': 'Not Reported',
+        'covid19_positive': 'Not Reported',
+        'study_modalities': 'Missing Data',
+        'loinc_methods': 'Missing LOINC',
+        'loinc_methods_xr': 'Missing LOINC',
+    }
+    for col, fill in cols_to_nr.items():
         if col in df.columns:
-            df[col] = df[col].fillna('Not Reported')
-    if 'loinc_methods_xr' in df.columns:
-        df['loinc_methods_xr'] = df['loinc_methods_xr'].fillna('None')
+            df[col] = df[col].fillna(fill)
     return df
 
 
+def adjust_loinc_methods(df):
+    if 'loinc_methods_xr' in df.columns and 'study_modality' in df.columns:
+        mask = df['study_modality'].notna() & df['loinc_methods_xr'].isna()
+        df.loc[mask, 'loinc_methods_xr'] = 'None'
+    return df
+
 def process_dataframe(df):
     """Applies both transformations on a pandas DataFrame."""
     df['date'] = extract_earliest_date(df['datasets.submitter_id'])
+    df = adjust_loinc_methods(df)  # Optional; must run before fix_nan, which fills the remaining NaNs
     df = fix_nan(df)
     df = adjust_age(df)
     df = adjust_race(df)