From bbcf165bacf6ece65157b49738c38a7f27b54510 Mon Sep 17 00:00:00 2001 From: rtomek Date: Thu, 1 May 2025 16:50:26 -0500 Subject: [PATCH 1/7] Add context menu to copyabletableview --- docs/source/modules.rst | 6 +- jsdconfig-zipcode.yaml | 64 +++++++++++--- .../gui/pyside6/copyabletableview.py | 22 ++++- src/midrc_react/gui/pyside6/jsdview.py | 87 +++++++++++++++---- src/midrc_react/plugins/midrc_tsv_loader.py | 2 +- 5 files changed, 144 insertions(+), 37 deletions(-) diff --git a/docs/source/modules.rst b/docs/source/modules.rst index ba39b8e..952175e 100644 --- a/docs/source/modules.rst +++ b/docs/source/modules.rst @@ -1,7 +1,7 @@ -midrc_react -=========== +src +=== .. toctree:: - :maxdepth: 8 + :maxdepth: 7 midrc_react diff --git a/jsdconfig-zipcode.yaml b/jsdconfig-zipcode.yaml index 0f3ee9d..99d8133 100644 --- a/jsdconfig-zipcode.yaml +++ b/jsdconfig-zipcode.yaml @@ -1,10 +1,22 @@ data sources: # The data sources will be loaded in the order they are populated here - name: MIDRC - description: MIDRC Excel File + description: MIDRC TSV File data type: file - filename: data/MIDRC Open A1 and R1 - cumulative by batch.xlsx - remove column name text: [(CUSUM)] + filename: data/midrc_data_download-2025-01-29.tsv + columns: + - Age at Index + - Ethnicity + - Race + - Sex + - COVID-19 Positive + - Race and Ethnicity + numeric_cols: + Age at Index: + raw column: age_at_index + bins: [ 0, 5, 12, 16, 18, 30, 40, 50, 65, 75, 85, 1000 ] + labels: [ '0-4', '5-11', '12-15', '16-17', '18-29', '30-39', '40-49', '50-64', '65-74', '75-84', '85+' ] + plugin: midrc_tsv_loader - name: CDC description: CDC Excel File @@ -18,13 +30,25 @@ data sources: date: '2020-01-01' # The census file does not have a date column, so we specify the date here - name: MIDRC COVID+ - description: MIDRC COVID+ Excel File + description: MIDRC COVID+ TSV File data type: file - filename: data/MIDRC Open A1 and R1 COVIDpos only - cumulative by batch.xlsx - remove column name text: [(CUSUM)] + filename: data/midrc_data_download-2025-01-29_covid_pos.tsv + columns: + - Age at Index + - Ethnicity + - Race + - Sex + - COVID-19 Positive + - Race and Ethnicity + numeric_cols: + Age at Index: + raw column: age_at_index + bins: [ 0, 5, 12, 16, 18, 30, 40, 50, 65, 75, 85, 1000 ] + labels: [ '0-4', '5-11', '12-15', '16-17', '18-29', '30-39', '40-49', '50-64', '65-74', '75-84', '85+' ] + plugin: midrc_tsv_loader - name: Zip Code 1 - description: Zip Code 1 Excel File + description: Zip Code 1 TSV File data type: file filename: data/midrc_data_download-2025-01-29_0.tsv columns: @@ -32,17 +56,17 @@ data sources: - Ethnicity - Race - Sex - - Covid19 Positive + - COVID-19 Positive - Race and Ethnicity numeric_cols: Age at Index: raw column: age_at_index - bins: [ 0, 17, 50, 65, 1000 ] - labels: ['0-17', "18-49", '50-64', '65+'] + bins: [ 0, 5, 12, 16, 18, 30, 40, 50, 65, 75, 85, 1000 ] + labels: [ '0-4', '5-11', '12-15', '16-17', '18-29', '30-39', '40-49', '50-64', '65-74', '75-84', '85+' ] plugin: midrc_tsv_loader - name: Zip Code 2 - description: Zip Code 2 Excel File + description: Zip Code 2 TSV File data type: file filename: data/midrc_data_download-2025-01-29_1.tsv columns: @@ -50,15 +74,27 @@ data sources: - Ethnicity - Race - Sex - - Covid19 Positive + - COVID-19 Positive - Race and Ethnicity numeric_cols: Age at Index: raw column: age_at_index - bins: [ 0, 17, 50, 65, 1000 ] - labels: ['0-17', "18-49", '50-64', '65+'] + bins: [ 0, 5, 12, 16, 18, 30, 40, 50, 65, 75, 85, 1000 ] + labels: [ '0-4', '5-11', '12-15', '16-17', '18-29', '30-39', '40-49', '50-64', '65-74', '75-84', '85+' ] plugin: midrc_tsv_loader + - name: MIDRC COVID+ + description: MIDRC COVID+ Excel File + data type: file + filename: data/MIDRC Open A1 and R1 COVIDpos only - cumulative by batch.xlsx + remove column name text: [(CUSUM)] + + - name: MIDRC + description: MIDRC Excel File + data type: file + filename: data/MIDRC Open A1 and R1 - cumulative by batch.xlsx + remove column name text: [(CUSUM)] + # TODO: The following should be moved into QSettings for modifications within the GUI # For custom age columns, please use .inf as the maximum age in the final age group custom age ranges: diff --git a/src/midrc_react/gui/pyside6/copyabletableview.py b/src/midrc_react/gui/pyside6/copyabletableview.py index f3cb80e..9472871 100644 --- a/src/midrc_react/gui/pyside6/copyabletableview.py +++ b/src/midrc_react/gui/pyside6/copyabletableview.py @@ -23,7 +23,7 @@ from PySide6.QtCore import QDate, QEvent, QObject, Qt from PySide6.QtGui import QGuiApplication, QKeySequence -from PySide6.QtWidgets import QTableView +from PySide6.QtWidgets import QTableView, QMenu class CopyableTableView(QTableView): @@ -78,3 +78,23 @@ def copy_selection(self) -> None: stream = io.StringIO() csv.writer(stream, delimiter='\t').writerows(table) QGuiApplication.clipboard().setText(stream.getvalue()) + + def contextMenuEvent(self, event) -> None: + """ + Create a context menu with 'Select All' and 'Copy' options on right-click. + + Args: + event (QContextMenuEvent): The context menu event. + + Returns: + None + """ + menu = QMenu(self) + select_all_action = menu.addAction("Select All") + copy_action = menu.addAction("Copy") + + action = menu.exec(event.globalPos()) + if action == select_all_action: + self.selectAll() + elif action == copy_action: + self.copy_selection() diff --git a/src/midrc_react/gui/pyside6/jsdview.py b/src/midrc_react/gui/pyside6/jsdview.py index 473029f..4c22096 100644 --- a/src/midrc_react/gui/pyside6/jsdview.py +++ b/src/midrc_react/gui/pyside6/jsdview.py @@ -455,7 +455,7 @@ def update_spider_chart_title(self, file1_data: str = 'File 1', file2_data: str def update_area_chart(self, category: Dict[Any, Any]) -> bool: """ - Update the area chart with new data from the provided sheets. + Update the area chart with new data from the provided sheets, ensuring the series follow a common order. Args: category (dict): A dictionary where each key maps to a sheet containing chart data. @@ -466,36 +466,80 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool: category_str: str = self._dataselectiongroupbox.category_combobox.currentText() clear_layout(self.area_chart_widget.layout()) + # Compute the global minimum and maximum dates across all sheets + global_min = None + global_max = None + for sheets in category.values(): + cat = category_str[:-6] if category_str.endswith(" (ks2)") else category_str + if cat in sheets: + df = sheets[cat].df + dates_list = [ + QDateTime(numpy_datetime64_to_qdate(date), QTime()) + for date in df.date.values + ] + if dates_list: + local_min = min(dates_list, key=lambda d: d.toMSecsSinceEpoch()) + local_max = max(dates_list, key=lambda d: d.toMSecsSinceEpoch()) + if global_min is None or local_min < global_min: + global_min = local_min + if global_max is None or local_max > global_max: + global_max = local_max + + # If no dates found, return early. + if global_min is None or global_max is None: + return False + + # Compute the union order (common order) for the category across all sheets. + common_order: List[str] = [] + for sheets in category.values(): + cat = category_str[:-6] if category_str.endswith(" (ks2)") else category_str + if cat in sheets: + for col in sheets[cat].data_columns: + if col not in common_order: + common_order.append(col) + for index, sheets in category.items(): + cat = category_str[:-6] if category_str.endswith(" (ks2)") else category_str area_chart: QChart = QChart() filename: str = self.dataselectiongroupbox.file_comboboxes[index].currentData() - area_chart.setTitle(f'{filename} {category_str} distribution over time') + area_chart.setTitle(f"{filename} {cat} distribution over time") - if category_str.endswith(' (ks2)'): - category_str = category_str[:-6] - if category_str not in sheets: + if cat not in sheets: continue - df = sheets[category_str].df - cols_to_use = sheets[category_str].data_columns - dates: List[QDateTime] = [QDateTime(numpy_datetime64_to_qdate(date), QTime()) for date in df.date.values] - - JsdWindow._add_area_chart_series(area_chart, df, cols_to_use, dates) - JsdWindow._attach_axes_to_area_chart(area_chart, dates) + df = sheets[cat].df + sheet_order = sheets[cat].data_columns + # Compute final order with common ordering + final_order = [col for col in common_order if col in sheet_order] + final_order += [col for col in sheet_order if col not in final_order] + if "Not Reported" in final_order: + final_order = [col for col in final_order if col != "Not Reported"] + ["Not Reported"] + + dates: List[QDateTime] = [ + QDateTime(numpy_datetime64_to_qdate(date), QTime()) + for date in df.date.values + ] + # Call the modified helper with the global max date. + JsdWindow._add_area_chart_series(area_chart, df, final_order, dates, global_max) + # Attach axes using global min and max so that x-axis is consistent. + JsdWindow._attach_axes_to_area_chart(area_chart, global_min, global_max) self.add_area_chart_view(area_chart) return True @staticmethod - def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str], dates: List[QDateTime]) -> None: + def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str], + dates: List[QDateTime], global_max: QDateTime) -> None: """ Add series to an area chart based on provided data. + If the last date is before global_max, an additional data point is appended with the same series value. Args: area_chart (QChart): The chart to update. df (DataFrame): Data source for the series. cols_to_use (List[str]): List of columns to plot. dates (List[QDateTime]): X-axis dates for the chart. + global_max (QDateTime): The global maximum date. Returns: None @@ -504,6 +548,7 @@ def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str], total_counts = df_cols.sum(axis=1) cumulative_percents = 100.0 * df_cols.cumsum(axis=1).div(total_counts, axis=0) lower_series = None + global_max_msecs = global_max.toMSecsSinceEpoch() for col in cols_to_use: if df_cols[col].iloc[-1] == 0: @@ -512,8 +557,12 @@ def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str], QPointF(dates[i].toMSecsSinceEpoch(), cumulative_percents.iloc[i][col]) for i in range(len(dates)) ] - if len(dates) == 1: - points.append(QPointF(dates[0].toMSecsSinceEpoch() + 1, cumulative_percents.iloc[0][col])) + # If the last data point is before global_max, add an extra point. + if points and points[-1].x() < global_max_msecs: + points.append(QPointF(global_max_msecs, points[-1].y())) + # In case only one date exists, also add a second point slightly offset if needed. + if len(points) == 1: + points.append(QPointF(points[0].x() + 1, points[0].y())) upper_series: QLineSeries = QLineSeries(area_chart) upper_series.append(points) area_series: QAreaSeries = QAreaSeries(upper_series, lower_series) @@ -522,13 +571,15 @@ def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str], lower_series = upper_series @staticmethod - def _attach_axes_to_area_chart(area_chart: QChart, dates: List[QDateTime]) -> None: + def _attach_axes_to_area_chart(area_chart: QChart, global_min: QDateTime, + global_max: QDateTime) -> None: """ - Attach X and Y axes to an area chart. + Attach X and Y axes to an area chart using global min and max dates for the X-axis. Args: area_chart (QChart): The chart to attach axes to. - dates (List[QDateTime]): List of dates for the X-axis. + global_min (QDateTime): The global minimum date. + global_max (QDateTime): The global maximum date. Returns: None @@ -537,7 +588,7 @@ def _attach_axes_to_area_chart(area_chart: QChart, dates: List[QDateTime]) -> No axis_x.setTickCount(10) axis_x.setFormat("MMM yyyy") axis_x.setTitleText("Date") - axis_x.setRange(dates[0], dates[-1] if len(dates) > 1 else dates[0].addMSecs(1)) + axis_x.setRange(global_min, global_max) area_chart.addAxis(axis_x, Qt.AlignBottom) axis_y: QValueAxis = QValueAxis() diff --git a/src/midrc_react/plugins/midrc_tsv_loader.py b/src/midrc_react/plugins/midrc_tsv_loader.py index dee9af0..5c17c82 100644 --- a/src/midrc_react/plugins/midrc_tsv_loader.py +++ b/src/midrc_react/plugins/midrc_tsv_loader.py @@ -87,7 +87,7 @@ def classify(row): if ethnicity == 'Hispanic or Latino': return ethnicity - if race == 'Not Reported' or ethnicity == 'Not Reported' or pd.isna(race) or pd.isna(ethnicity): + if race == 'Not Reported' or ethnicity == 'Not Reported': # or pd.isna(race) or pd.isna(ethnicity): return 'Not Reported' return f'{race}, {ethnicity}' From 7d246494a3a90a0f2db12c6854bde6d78891903e Mon Sep 17 00:00:00 2001 From: Robert Tomek Date: Thu, 22 May 2025 15:00:45 -0500 Subject: [PATCH 2/7] fix potential issues with data types --- src/midrc_react/core/data_preprocessing.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/midrc_react/core/data_preprocessing.py b/src/midrc_react/core/data_preprocessing.py index f22d9f7..a5b61a3 100644 --- a/src/midrc_react/core/data_preprocessing.py +++ b/src/midrc_react/core/data_preprocessing.py @@ -60,8 +60,12 @@ def _adjust_outliers(df: pd.DataFrame, cut_column_name: str, column_name: str, b low_text = "Outlier_Low" high_text = "Outlier_High" print(f"WARNING: There are values outside the bins specified for the '{column_name}' column.") - df.loc[df[cut_column_name].isna() & (df[column_name] < bins[0]), cut_column_name] = low_text - df.loc[df[cut_column_name].isna() & (df[column_name] >= bins[-1]), cut_column_name] = high_text + + # Only compare numeric values, ignore strings or other types + col_numeric = pd.to_numeric(df[column_name], errors='coerce') + + df.loc[df[cut_column_name].isna() & (col_numeric < bins[0]), cut_column_name] = low_text + df.loc[df[cut_column_name].isna() & (col_numeric >= bins[-1]), cut_column_name] = high_text df.loc[df[cut_column_name].isna(), cut_column_name] = new_text if (df[cut_column_name] == low_text).sum() > 0: print(f" {(df[cut_column_name] == low_text).sum()} values are below the min bin value.\n" From 9cb23646b407e81aacb298d215e5719aabbf5a56 Mon Sep 17 00:00:00 2001 From: Robert Tomek Date: Thu, 22 May 2025 16:12:04 -0500 Subject: [PATCH 3/7] fix potential bug with dataset column --- src/midrc_react/core/famd_calc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/midrc_react/core/famd_calc.py b/src/midrc_react/core/famd_calc.py index 19fe6e4..5468463 100644 --- a/src/midrc_react/core/famd_calc.py +++ b/src/midrc_react/core/famd_calc.py @@ -165,7 +165,11 @@ def calc_famd_distances(df, cols_to_use, numeric_cols, dataset_column='_dataset_ dict: Dictionary of distance values specified in distance_metrics for each dataset combination. """ - return calc_distances_via_df(calc_famd_df(df, cols_to_use, numeric_cols, print_outliers=print_outliers), + return calc_distances_via_df(calc_famd_df(df, + cols_to_use, + numeric_cols, + dataset_column, + print_outliers=print_outliers), 'famd_x_coordinates', dataset_column, distance_metrics=distance_metrics, From f228c5e14ffe79200b160f478a8db11d1a6794e2 Mon Sep 17 00:00:00 2001 From: Robert Tomek Date: Thu, 22 May 2025 18:04:22 -0500 Subject: [PATCH 4/7] Small tweaks to how the dataset column is handled for consistency --- src/midrc_react/core/aggregate_jsd_calc.py | 32 ++++++++++++---------- src/midrc_react/core/famd_calc.py | 13 ++++----- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/src/midrc_react/core/aggregate_jsd_calc.py b/src/midrc_react/core/aggregate_jsd_calc.py index 5a9168e..e8c1a9c 100644 --- a/src/midrc_react/core/aggregate_jsd_calc.py +++ b/src/midrc_react/core/aggregate_jsd_calc.py @@ -62,6 +62,22 @@ def calc_jsd_from_counts_dict(counts_dict, dataset_names): return output_dict +def calc_jsd_by_features_combined(combined_df: pd.DataFrame, cols_to_use: list[str], dataset_column) -> dict[str, float]: + # Pivot table to get counts for each combination + pivot_table = combined_df.pivot_table(index=cols_to_use, columns=dataset_column, aggfunc='size', fill_value=0) + pivot_table = pivot_table.reset_index() + + # Convert dataset columns to string in case they are integers + pivot_table.columns = pivot_table.columns.astype(str) + + labels = combined_df[dataset_column].unique().astype(str) + + # Create a dictionary to hold counts for each dataset + counts_dict = {dataset: pivot_table[dataset].values if dataset in pivot_table else np.zeros(len(pivot_table)) for + dataset in labels} + + return calc_jsd_from_counts_dict(counts_dict, labels) + def calc_jsd_by_features(df_list: list[pd.DataFrame], cols_to_use: list[str]) -> dict[str, float]: """ @@ -76,21 +92,7 @@ def calc_jsd_by_features(df_list: list[pd.DataFrame], cols_to_use: list[str]) -> """ dataset_column = '_dataset_' # Temporary column name to store dataset information combined_df = combine_datasets_from_list(df_list, dataset_column) - - # Pivot table to get counts for each combination - pivot_table = combined_df.pivot_table(index=cols_to_use, columns=dataset_column, aggfunc='size', fill_value=0) - pivot_table = pivot_table.reset_index() - - # Convert dataset columns to string in case they are integers - pivot_table.columns = pivot_table.columns.astype(str) - - labels = combined_df[dataset_column].unique() - - # Create a dictionary to hold counts for each dataset - counts_dict = {dataset: pivot_table[dataset].values if dataset in pivot_table else np.zeros(len(pivot_table)) for - dataset in labels} - - return calc_jsd_from_counts_dict(counts_dict, labels) + return calc_jsd_by_features_combined(combined_df, cols_to_use, dataset_column) def calc_jsd_by_features_2df(df1: pd.DataFrame, df2: pd.DataFrame, cols_to_use: list[str]) -> float: diff --git a/src/midrc_react/core/famd_calc.py b/src/midrc_react/core/famd_calc.py index 5468463..5889857 100644 --- a/src/midrc_react/core/famd_calc.py +++ b/src/midrc_react/core/famd_calc.py @@ -169,15 +169,16 @@ def calc_famd_distances(df, cols_to_use, numeric_cols, dataset_column='_dataset_ cols_to_use, numeric_cols, dataset_column, - print_outliers=print_outliers), + print_outliers=print_outliers + ), 'famd_x_coordinates', - dataset_column, + dataset_column=dataset_column, distance_metrics=distance_metrics, jsd_scaled_bin_width=jsd_scaled_bin_width, ) -def calc_famd_ks2_at_date(df1, df2, cols_to_use, numeric_cols, calc_date): +def calc_famd_ks2_at_date(df1, df2, cols_to_use, numeric_cols, calc_date, dataset_column='_dataset_'): """ Calculate the KS2 distance between two datasets at a specific date. @@ -194,7 +195,6 @@ def calc_famd_ks2_at_date(df1, df2, cols_to_use, numeric_cols, calc_date): df1_at_date = df1[df1['date'] <= calc_date] df2_at_date = df2[df2['date'] <= calc_date] - dataset_column = '_dataset_' combined_df = combine_datasets_from_list([df1_at_date, df2_at_date], dataset_column=dataset_column) distance_metrics = ['ks2'] @@ -203,7 +203,7 @@ def calc_famd_ks2_at_date(df1, df2, cols_to_use, numeric_cols, calc_date): return distance_dict['ks2']['Dataset 0 vs Dataset 1'] -def calc_famd_ks2_at_dates(df1, df2, cols_to_use, numeric_cols, calc_date_list): +def calc_famd_ks2_at_dates(df1, df2, cols_to_use, numeric_cols, calc_date_list, dataset_column='_dataset_'): """ Calculate the KS2 distance between two datasets at multiple dates. @@ -217,10 +217,9 @@ def calc_famd_ks2_at_dates(df1, df2, cols_to_use, numeric_cols, calc_date_list): Returns: list(float): list of KS2 distances at each date """ - dataset_column = '_dataset_' combined_df = combine_datasets_from_list([df1, df2], dataset_column=dataset_column) - famd_df = calc_famd_df(combined_df, cols_to_use, numeric_cols) + famd_df = calc_famd_df(combined_df, cols_to_use, numeric_cols, dataset_column=dataset_column) # Add date column to the DataFrame after FAMD fitting famd_df['date'] = combined_df['date'] From 0970b654c6036235671ee66d40dca5818b82762a Mon Sep 17 00:00:00 2001 From: Robert Tomek Date: Thu, 22 May 2025 19:32:43 -0500 Subject: [PATCH 5/7] Make sure non-numeric columns are strings to avoid potential issues with blank data in CSV/TSV files --- src/midrc_react/core/excel_layout.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/midrc_react/core/excel_layout.py b/src/midrc_react/core/excel_layout.py index 7a2614e..69ac805 100644 --- a/src/midrc_react/core/excel_layout.py +++ b/src/midrc_react/core/excel_layout.py @@ -162,6 +162,11 @@ def build_data_frames_from_csv(self, filename: str): # Apply numeric column adjustments df = self.apply_numeric_column_adjustments(df) + # Convert all non-numeric columns to string + for col in self._columns: + if col in df.columns: + df[col] = df[col].astype(str) + self.raw_data = df self.create_sheets_from_df(df) From 363bc8fb0e0feb623b122323ea67ba83a5872f4f Mon Sep 17 00:00:00 2001 From: Robert Tomek Date: Thu, 22 May 2025 20:41:36 -0500 Subject: [PATCH 6/7] Fix some display issues for some tsv file input --- src/midrc_react/core/jsdcontroller.py | 10 ++++++---- src/midrc_react/gui/pyside6/dataselectiongroupbox.py | 2 ++ src/midrc_react/gui/pyside6/jsdview.py | 11 +++++------ src/midrc_react/plugins/midrc_tsv_loader.py | 11 +++++++++++ 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/midrc_react/core/jsdcontroller.py b/src/midrc_react/core/jsdcontroller.py index 9e65307..84c0998 100644 --- a/src/midrc_react/core/jsdcontroller.py +++ b/src/midrc_react/core/jsdcontroller.py @@ -96,10 +96,9 @@ def connect_signals(self): dataselectiongroupbox_class_name = type(jsd_view.dataselectiongroupbox).__name__ if dataselectiongroupbox_class_name == 'JsdDataSelectionGroupBox': - for f_c in jsd_view.dataselectiongroupbox.file_comboboxes: - f_c.currentIndexChanged.connect(self.file_changed) jsd_view.dataselectiongroupbox.num_data_items_changed.connect(self.file_changed) jsd_view.dataselectiongroupbox.file_checkbox_state_changed.connect(self.file_changed) + jsd_view.dataselectiongroupbox.file_combobox_changed.connect(self.file_changed) jsd_view.dataselectiongroupbox.category_combobox.currentIndexChanged.connect(self.category_changed) elif dataselectiongroupbox_class_name == 'DataSelectionGroupBox': @@ -399,12 +398,15 @@ def update_file_based_charts(self): sheet_dict[i] = self.get_file_sheets_from_index(i) spider_plot_values = self.get_spider_plot_values(spider_plot_date) - self.jsd_view.update_spider_chart(spider_plot_values) + try: + self.jsd_view.update_spider_chart(spider_plot_values) + except (ValueError, KeyError, TypeError): + print('An error occurred during the update of the spider chart.') try: self.jsd_view.update_pie_chart_dock(sheet_dict) except (ValueError, KeyError, TypeError): - return False + print('An error occurred during the update of file-based charts.') return True diff --git a/src/midrc_react/gui/pyside6/dataselectiongroupbox.py b/src/midrc_react/gui/pyside6/dataselectiongroupbox.py index eaf9aeb..69f494b 100644 --- a/src/midrc_react/gui/pyside6/dataselectiongroupbox.py +++ b/src/midrc_react/gui/pyside6/dataselectiongroupbox.py @@ -38,6 +38,7 @@ class JsdDataSelectionGroupBox(QGroupBox, GroupBoxData): """ num_data_items_changed: Signal = Signal(int) file_checkbox_state_changed: Signal = Signal(bool) + file_combobox_changed: Signal = Signal(int) NUM_DEFAULT_DATA_ITEMS: int = 2 def __init__(self, data_sources): @@ -145,6 +146,7 @@ def add_file_combobox_to_layout(self, auto_populate: bool = True): self.form_layout.insertRow(index - 1, new_label, new_hbox) self.file_comboboxes.append(new_combobox) + new_combobox.currentIndexChanged.connect(self.file_combobox_changed.emit) self.file_checkboxes.append(new_checkbox) new_checkbox.toggled.connect(self.file_checkbox_state_changed.emit) diff --git a/src/midrc_react/gui/pyside6/jsdview.py b/src/midrc_react/gui/pyside6/jsdview.py index cb8e0a1..f34a2cb 100644 --- a/src/midrc_react/gui/pyside6/jsdview.py +++ b/src/midrc_react/gui/pyside6/jsdview.py @@ -310,9 +310,10 @@ def update_pie_chart_dock(self, sheet_dict: Dict[Any, Any]) -> None: common_order: Dict[str, List[str]] = {} for category in categories: common_order[category] = [] + for sheets in sheet_dict.values(): if category in sheets: - for col in sheets[category].data_columns: + for col in sorted(sheets[category].data_columns, key=str.lower): if col not in common_order[category]: common_order[category].append(col) @@ -335,8 +336,7 @@ def update_pie_chart_dock(self, sheet_dict: Dict[Any, Any]) -> None: # Append any extra columns from the sheet that are not already in final_order. final_order += [col for col in sheet_order if col not in final_order] # Ensure 'Not Reported' is always the last column. - if "Not Reported" in final_order: - final_order = [col for col in final_order if col != "Not Reported"] + ["Not Reported"] + final_order.sort(key=lambda x: x.lower() in ['nan', 'not reported', 'none']) series = QPieSeries() for col in final_order: @@ -517,7 +517,7 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool: for sheets in category.values(): cat = category_str[:-6] if category_str.endswith(" (ks2)") else category_str if cat in sheets: - for col in sheets[cat].data_columns: + for col in sorted(sheets[cat].data_columns, key=str.lower): if col not in common_order: common_order.append(col) @@ -535,8 +535,7 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool: # Compute final order with common ordering final_order = [col for col in common_order if col in sheet_order] final_order += [col for col in sheet_order if col not in final_order] - if "Not Reported" in final_order: - final_order = [col for col in final_order if col != "Not Reported"] + ["Not Reported"] + final_order.sort(key=lambda x: x.lower() in ['nan', 'not reported', 'none']) dates: List[QDateTime] = [ QDateTime(numpy_datetime64_to_qdate(date), QTime()) diff --git a/src/midrc_react/plugins/midrc_tsv_loader.py b/src/midrc_react/plugins/midrc_tsv_loader.py index 5c17c82..511b93b 100644 --- a/src/midrc_react/plugins/midrc_tsv_loader.py +++ b/src/midrc_react/plugins/midrc_tsv_loader.py @@ -105,10 +105,21 @@ def adjust_column_names(df): }) return df +def fix_nan(df): + """Replaces NaN values with 'Not Reported'.""" + cols_to_nr = ['sex', 'race', 'ethnicity', 'covid19_positive', 'study_modalities', 'loinc_methods'] + for col in cols_to_nr: + if col in df.columns: + df[col] = df[col].fillna('Not Reported') + if 'loinc_methods_xr' in df.columns: + df['loinc_methods_xr'] = df['loinc_methods_xr'].fillna('None') + return df + def process_dataframe(df): """Applies both transformations on a pandas DataFrame.""" df['date'] = extract_earliest_date(df['datasets.submitter_id']) + df = fix_nan(df) df = adjust_age(df) df = adjust_race(df) df = combine_race_ethnicity(df) From 4736fade939319c9e6e224ac0d7080a0a6d36640 Mon Sep 17 00:00:00 2001 From: Robert Tomek Date: Fri, 23 May 2025 12:15:43 -0500 Subject: [PATCH 7/7] Use a common color pallette across all files, and across pie charts to area charts --- src/midrc_react/gui/pyside6/jsdview.py | 69 +++++++++++++-------- src/midrc_react/plugins/midrc_tsv_loader.py | 25 ++++++-- 2 files changed, 61 insertions(+), 33 deletions(-) diff --git a/src/midrc_react/gui/pyside6/jsdview.py b/src/midrc_react/gui/pyside6/jsdview.py index f34a2cb..1f716ad 100644 --- a/src/midrc_react/gui/pyside6/jsdview.py +++ b/src/midrc_react/gui/pyside6/jsdview.py @@ -29,7 +29,7 @@ from PySide6.QtCore import ( QDateTime, QPointF, QRect, Qt, QTime, Signal, ) -from PySide6.QtGui import QAction, QPainter +from PySide6.QtGui import QAction, QPainter, QPen, QBrush, QColor from PySide6.QtWidgets import ( QDialog, QDialogButtonBox, QDockWidget, QFormLayout, QHBoxLayout, QHeaderView, QLabel, QLayout, QMainWindow, QMenu, QMenuBar, QScrollArea, QSpinBox, QSplitter, @@ -63,6 +63,11 @@ class JsdWindow(QMainWindow, JsdViewBase): 'pie_chart_dock': 'Pie Charts - ' + WINDOW_TITLE, 'spider_chart_dock': 'Distribution Charts - ' + WINDOW_TITLE, } + SORT_TO_END = ['nan', 'not reported', 'none', 'missing', 'not available', 'not applicable', 'n/a'] + # New class attribute for common palette + chart_palette: List[str] = ["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", + "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", + "#cab2d6", "#6a3d9a"] def __init__(self, data_sources: Any) -> None: """ @@ -317,6 +322,13 @@ def update_pie_chart_dock(self, sheet_dict: Dict[Any, Any]) -> None: if col not in common_order[category]: common_order[category].append(col) + common_palette: Dict[str, List[str]] = {} + for category in categories: + palette = [] + for i, _ in enumerate(common_order[category]): + palette.append(JsdWindow.chart_palette[i % len(JsdWindow.chart_palette)]) + common_palette[category] = palette + timepoint: int = -1 file_comboboxes = self.dataselectiongroupbox.file_comboboxes labels: List[QLabel] = JsdWindow._create_pie_chart_labels(sheet_dict, file_comboboxes) @@ -336,13 +348,16 @@ def update_pie_chart_dock(self, sheet_dict: Dict[Any, Any]) -> None: # Append any extra columns from the sheet that are not already in final_order. final_order += [col for col in sheet_order if col not in final_order] # Ensure 'Not Reported' is always the last column. - final_order.sort(key=lambda x: x.lower() in ['nan', 'not reported', 'none']) + final_order.sort(key=lambda x: x.lower() in JsdWindow.SORT_TO_END) series = QPieSeries() for col in final_order: value = df[col].iloc[timepoint] if value > 0: - series.append(col, value) + slice = series.append(col, value) + # Lookup common order index to get consistent color across rows. + idx = common_order[category].index(col) + slice.setColor(QColor(common_palette[category][idx])) if not series.isEmpty(): row_layout.addWidget(JsdWindow._create_pie_chart_series(series, category), stretch=1) self.pie_chart_layout.addLayout(row_layout, stretch=1) @@ -535,14 +550,14 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool: # Compute final order with common ordering final_order = [col for col in common_order if col in sheet_order] final_order += [col for col in sheet_order if col not in final_order] - final_order.sort(key=lambda x: x.lower() in ['nan', 'not reported', 'none']) + final_order.sort(key=lambda x: x.lower() in JsdWindow.SORT_TO_END) dates: List[QDateTime] = [ QDateTime(numpy_datetime64_to_qdate(date), QTime()) for date in df.date.values ] - # Call the modified helper with the global max date. - JsdWindow._add_area_chart_series(area_chart, df, final_order, dates, global_max) + # Pass common_order for color assignment + JsdWindow._add_area_chart_series(area_chart, df, final_order, dates, global_max, common_order) # Attach axes using global min and max so that x-axis is consistent. JsdWindow._attach_axes_to_area_chart(area_chart, global_min, global_max) self.add_area_chart_view(area_chart) @@ -551,20 +566,19 @@ def update_area_chart(self, category: Dict[Any, Any]) -> bool: @staticmethod def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str], - dates: List[QDateTime], global_max: QDateTime) -> None: + dates: List[QDateTime], global_max: QDateTime, + common_order: List[str]) -> None: """ - Add series to an area chart based on provided data. - If the last date is before global_max, an additional data point is appended with the same series value. + Add series to an area chart based on provided data and set its color using chart_palette. + If the last date is before global_max, append a point to the series. Args: - area_chart (QChart): The chart to update. - df (DataFrame): Data source for the series. - cols_to_use (List[str]): List of columns to plot. - dates (List[QDateTime]): X-axis dates for the chart. - global_max (QDateTime): The global maximum date. - - Returns: - None + area_chart (QChart): The chart to add series to. + df (DataFrame): The data frame containing the data. + cols_to_use (List[str]): List of columns to use for the series. + dates (List[QDateTime]): List of dates for the x-axis. + global_max (QDateTime): The maximum date for the x-axis. + common_order (List[str]): The common order for the series. """ df_cols = df[cols_to_use] total_counts = df_cols.sum(axis=1) @@ -572,23 +586,27 @@ def _add_area_chart_series(area_chart: QChart, df: Any, cols_to_use: List[str], lower_series = None global_max_msecs = global_max.toMSecsSinceEpoch() - for col in cols_to_use: + for i, col in enumerate(cols_to_use): if df_cols[col].iloc[-1] == 0: continue - points: List[QPointF] = [ - QPointF(dates[i].toMSecsSinceEpoch(), cumulative_percents.iloc[i][col]) - for i in range(len(dates)) - ] - # If the last data point is before global_max, add an extra point. + points: List[QPointF] = [QPointF(dates[j].toMSecsSinceEpoch(), cumulative_percents.iloc[j][col]) + for j in range(len(dates))] if points and points[-1].x() < global_max_msecs: points.append(QPointF(global_max_msecs, points[-1].y())) - # In case only one date exists, also add a second point slightly offset if needed. if len(points) == 1: points.append(QPointF(points[0].x() + 1, points[0].y())) upper_series: QLineSeries = QLineSeries(area_chart) upper_series.append(points) area_series: QAreaSeries = QAreaSeries(upper_series, lower_series) area_series.setName(col) + # Determine the color using common_order and chart_palette class attribute. + try: + idx = common_order.index(col) + except ValueError: + idx = i + color = QColor(JsdWindow.chart_palette[idx % len(JsdWindow.chart_palette)]) + area_series.setBrush(QBrush(color)) + area_series.setPen(QPen(color)) area_chart.addSeries(area_series) lower_series = upper_series @@ -785,6 +803,3 @@ def clear_layout(layout: Optional[QLayout]) -> bool: layout.removeItem(child) return True - - - diff --git a/src/midrc_react/plugins/midrc_tsv_loader.py b/src/midrc_react/plugins/midrc_tsv_loader.py index 511b93b..2d6ce1b 100644 --- a/src/midrc_react/plugins/midrc_tsv_loader.py +++ b/src/midrc_react/plugins/midrc_tsv_loader.py @@ -106,19 +106,32 @@ def adjust_column_names(df): return df def fix_nan(df): - """Replaces NaN values with 'Not Reported'.""" - cols_to_nr = ['sex', 'race', 'ethnicity', 'covid19_positive', 'study_modalities', 'loinc_methods'] - for col in cols_to_nr: + """Replaces NaN values with e.g. 'Not Reported'.""" + cols_to_nr = { + 'sex': 'Not Reported', + 'race': 'Not Reported', + 'ethnicity': 'Not Reported', + 'covid19_positive': 'Not Reported', + 'study_modalities': 'Missing Data', + 'loinc_methods': 'Missing LOINC', + 'loinc_methods_xr': 'Missing LOINC', + } + for col, fill in cols_to_nr.items(): if col in df.columns: - df[col] = df[col].fillna('Not Reported') - if 'loinc_methods_xr' in df.columns: - df['loinc_methods_xr'] = df['loinc_methods_xr'].fillna('None') + df[col] = df[col].fillna(fill) return df +def adjust_loinc_methods(df): + if 'loinc_methods_xr' in df.columns and 'study_modality' in df.columns: + mask = df['study_modality'].notna() & df['loinc_methods_xr'].isna() + df.loc[mask, 'loinc_methods_xr'] = 'None' + return df + def process_dataframe(df): """Applies both transformations on a pandas DataFrame.""" df['date'] = extract_earliest_date(df['datasets.submitter_id']) + df = adjust_loinc_methods(df) # Optional: Adjust loinc_methods_xr based on study_modality df = fix_nan(df) df = adjust_age(df) df = adjust_race(df)