Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,4 @@ seaborn>=0.13.2
ipywidgets>=8.1.5
ipython>=8.28.0
python-dateutil>=2.9.0.post0
tabulate>=0.9.0
scikit-learn>=1.6.1
30 changes: 22 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,33 @@
"PySide6>=6.6.0",
"scipy>=1.12.0",
"openpyxl>=3.1.0",
"pytest>=8.1.0",
"PyYAML>=6.0.0",
"prince>=0.15.0",
"dash>=2.18.2",
"plotly>=5.24.1",
"matplotlib>=3.9.0",
"seaborn>=0.13.2",
"ipywidgets>=8.1.5",
"ipython>=8.28.0",
"pydantic>=2.7.0",
"python-dateutil>=2.9.0.post0",
"tabulate>=0.9.0",
"scikit-learn>=1.6.1",
],
optional_dependencies={
"ipython": [
"ipywidgets>=8.1.5",
"ipython>=8.28.0",
"matplotlib>=3.9.0",
"seaborn>=0.13.2",
"plotly>=5.24.1",
],
"dash": [
"dash>=2.18.2",
"dash-bootstrap-components>=1.4.1",
"plotly>=5.24.1",
],
"dev": [
"pytest>=8.1.0",
"black>=24.9.1",
"flake8>=6.1.0",
"mypy>=1.3.0",
],
},

python_requires=">=3.10",
entry_points={
"console_scripts": [
Expand Down
12 changes: 6 additions & 6 deletions src/Untitled.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1561,7 +1561,7 @@
"import importlib\n",
"importlib.reload(ExcelLayout)\n",
"\n",
"midrc_data = ExcelLayout.DataSource('MIDRC')\n",
"midrc_data = ExcelLayout.DataSourceConfig('MIDRC')\n",
"print( midrc_data.sheets['Race'].columns.values() )\n",
"print( midrc_data.sheets['Race'].df.columns )\n",
"cols_to_use = midrc_data.sheets['Race'].df.columns.intersection(midrc_data.sheets['Race'].columns.values())\n",
Expand All @@ -1571,15 +1571,15 @@
"midrc_race_data = np.asarray(midrc_data.sheets['Race'].df[cols_to_use].iloc[-1].values,dtype=float)\n",
"print(midrc_race_data)\n",
"\n",
"cdc_data = ExcelLayout.DataSource('CDC')\n",
"cdc_data = ExcelLayout.DataSourceConfig('CDC')\n",
"#print( cdc_data.sheets['Race'].columns )\n",
"cols_to_use = cdc_data.sheets['Race'].df.columns.intersection(cdc_data.sheets['Race'].columns.values())\n",
"#Remove date column\n",
"cols_to_use = cols_to_use[1:]\n",
"cdc_race_data = np.asarray(cdc_data.sheets['Race'].df[cols_to_use].iloc[-1].values,dtype=float)\n",
"print(cdc_race_data)\n",
"\n",
"census_data = ExcelLayout.DataSource('Census')\n",
"census_data = ExcelLayout.DataSourceConfig('Census')\n",
"#print( census_data.sheets['Race'].columns.values() )\n",
"#print( census_data.sheets['Race'].df.columns )\n",
"cols_to_use = census_data.sheets['Race'].df.columns.intersection(census_data.sheets['Race'].columns.values())\n",
Expand Down Expand Up @@ -1607,17 +1607,17 @@
"\n",
"sheet_name = 'Race'\n",
"\n",
"midrc_data = ExcelLayout.DataSource('MIDRC')\n",
"midrc_data = ExcelLayout.DataSourceConfig('MIDRC')\n",
"cols_to_use = midrc_data.sheets[sheet_name].df.columns.intersection(midrc_data.sheets[sheet_name].columns.values())\n",
"cols_to_use = cols_to_use[1:]\n",
"midrc_sheet_data = np.asarray(midrc_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",
"\n",
"cdc_data = ExcelLayout.DataSource('CDC')\n",
"cdc_data = ExcelLayout.DataSourceConfig('CDC')\n",
"cols_to_use = cdc_data.sheets[sheet_name].df.columns.intersection(cdc_data.sheets[sheet_name].columns.values())\n",
"cols_to_use = cols_to_use[1:]\n",
"cdc_sheet_data = np.asarray(cdc_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",
"\n",
"census_data = ExcelLayout.DataSource('Census')\n",
"census_data = ExcelLayout.DataSourceConfig('Census')\n",
"cols_to_use = census_data.sheets[sheet_name].df.columns.intersection(census_data.sheets[sheet_name].columns.values())\n",
"cols_to_use = cols_to_use[1:]\n",
"census_sheet_data = np.asarray(census_data.sheets[sheet_name].df[cols_to_use].iloc[-1].values,dtype=float)\n",
Expand Down
2 changes: 1 addition & 1 deletion src/midrc_react/core/aggregate_jsd_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def calc_jsd_by_features_combined(combined_df: pd.DataFrame, cols_to_use: list[s
# Convert dataset columns to string in case they are integers
pivot_table.columns = pivot_table.columns.astype(str)

labels = combined_df[dataset_column].unique().astype(str)
labels = sorted(combined_df[dataset_column].unique().astype(str))

# Create a dictionary to hold counts for each dataset
counts_dict = {dataset: pivot_table[dataset].values if dataset in pivot_table else np.zeros(len(pivot_table)) for
Expand Down
42 changes: 22 additions & 20 deletions src/midrc_react/core/excel_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,20 @@ def __init__(self, data_source, custom_age_ranges=None):
data_source (dict): The data source configuration.
custom_age_ranges (dict, optional): A dictionary of custom age ranges.
"""
self.name = data_source['name']
self.name = data_source.name
self.sheets = {}
self.datatype = data_source['data type']
self.filename = data_source['filename']
self.datatype = data_source.data_type
self.filename = data_source.filename
self.data_source = data_source
self.custom_age_ranges = custom_age_ranges
self._numeric_cols = data_source.get('numeric_cols', {}) # Extract numeric columns from config
self._columns = data_source.get('columns', [])
self._numeric_cols = data_source.numeric_cols # Extract numeric columns from config
self._columns = data_source.columns
self.raw_data = None

# Load preprocessing plugin if specified
self.preprocessor = None
if 'plugin' in data_source and data_source['plugin']:
plugin_name = data_source['plugin']
if data_source.plugin:
plugin_name = data_source.plugin
plugin_path = os.path.join("plugins", f"{plugin_name}.py")
self.preprocessor = DataSource.load_plugin(plugin_path)

Expand All @@ -64,8 +64,8 @@ def __init__(self, data_source, custom_age_ranges=None):
self.build_data_frames_from_csv(self.filename)
else:
self.build_data_frames_from_file(self.filename)
if self.datatype == 'content' and 'content' in data_source:
self.build_data_frames_from_content(data_source['content'])
if self.datatype == 'content' and hasattr(data_source, 'content') and data_source.content is not None:
self.build_data_frames_from_content(data_source.content)

def raw_columns_to_use(self):
"""
Expand Down Expand Up @@ -126,9 +126,9 @@ def apply_numeric_column_adjustments(self, df: pd.DataFrame):
pd.DataFrame: The DataFrame with numeric column adjustments.
"""
for str_col, col_dict in self._numeric_cols.items():
num_col = col_dict['raw column'] if 'raw column' in col_dict else str_col
bins = col_dict['bins'] if 'bins' in col_dict else None
labels = col_dict['labels'] if 'labels' in col_dict else None
num_col = col_dict.raw_column if hasattr(col_dict, 'raw_column') else str_col
bins = col_dict.bins if hasattr(col_dict, 'bins') else None
labels = col_dict.labels if hasattr(col_dict, 'labels') else None

if num_col in df.columns:
df = bin_dataframe_column(df, num_col, str_col, bins=bins, labels=labels)
Expand All @@ -139,7 +139,6 @@ def apply_numeric_column_adjustments(self, df: pd.DataFrame):
# else:
# # Default "N-N" format conversion
# df[str_col] = df[num_col].apply(lambda x: f'{int(x)}-{int(x)}' if pd.notna(x) else x)

return df

def build_data_frames_from_csv(self, filename: str):
Expand Down Expand Up @@ -226,7 +225,7 @@ def create_sheets_from_df(self, df: pd.DataFrame):
if col in df.columns:
df_cumsum = self.calculate_cumulative_sums(df, col)
if col in self._numeric_cols:
labels = self._numeric_cols[col].get('labels', None)
labels = self._numeric_cols[col].labels if hasattr(self._numeric_cols[col], 'labels') else None
if labels:
# The first column (e.g., date) remains at index 0.
date_column = df_cumsum.columns[0]
Expand Down Expand Up @@ -333,25 +332,28 @@ def _process_date_column(self, data_source: dict):
"""

# This assumes that the first column is either the date column or does not have useful data
if data_source.get('date'):
date_value = getattr(data_source, 'date', None)
if date_value:
self._df.drop(self._df.columns[0], axis=1, inplace=True)
self._df.insert(0, 'date', data_source['date'], False)
self._df.insert(0, 'date', date_value, False)

self._df['date'] = pd.to_datetime(self._df['date'], errors='coerce')

self._columns['date'] = self._df.columns[0]

def _process_columns(self, data_source: dict):
def _process_columns(self, data_source):
"""
Process and rename columns according to the data source settings.

Args:
data_source (dict): The data source object.
data_source (DataSource): The data source object.
"""
for col in self._df.columns[1:]:
col_name = col
if 'remove column name text' in data_source:
for txt in data_source['remove column name text']:
# Access remove_column_name_text from pydantic model
remove_text = getattr(data_source, 'remove_column_name_text', None)
if remove_text:
for txt in remove_text:
col_name = col.split(txt)[0]
col_name = col_name.rstrip()
self._columns[col_name] = col
Expand Down
7 changes: 5 additions & 2 deletions src/midrc_react/core/famd_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import numpy as np
import pandas as pd
import prince
from tabulate import tabulate

from midrc_react.core.data_preprocessing import combine_datasets_from_list
from midrc_react.core.numeric_distances import calc_distances_via_df, scale_feature
Expand Down Expand Up @@ -132,7 +131,11 @@ def calc_famd_df(raw_df, cols_to_use, numeric_cols, dataset_column='_dataset_',
if len(outlier_df) > 0:
outlier_df = outlier_df.sort_values(by=famd_column, ascending=False)
print(f"Outliers in FAMD fitting: {outlier_df.shape[0]}")
print(tabulate(outlier_df, headers='keys', tablefmt='psql'))
try:
from tabulate import tabulate
print(tabulate(outlier_df, headers='keys', tablefmt='psql'))
except ImportError:
print(outlier_df)

return c_df

Expand Down
71 changes: 63 additions & 8 deletions src/midrc_react/core/jsdconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,64 @@
This module contains the JSDConfig class, which loads and stores data from a YAML file.
"""

from dataclasses import dataclass, field
import os
from typing import List, Optional, Dict, Union, Any

from pydantic import BaseModel, Field, ValidationError
from pydantic.dataclasses import dataclass
from yaml import load
try:
from yaml import CLoader as Loader
except ImportError:
from yaml import Loader


class NumericColumnConfig(BaseModel):
    """
    Configuration of a single numeric column entry in the YAML configuration.

    Values are read from the YAML keys given as aliases below (for example
    ``raw column``), so the attribute names can stay valid Python identifiers.
    """
    # This is a plain pydantic model: a dataclass decorator must not be stacked
    # on top of a BaseModel subclass, so none is applied here.

    # Source column holding the numeric values (YAML key: 'raw column').
    # NOTE(review): required by this model; confirm every existing YAML config
    # supplies it, since consumers appear to treat it as optional.
    raw_column: str = Field(..., alias='raw column')
    # Bin edges for the numeric column (YAML key: 'bins'); required.
    bins: List[float]
    # Optional labels for the bins (YAML key: 'labels').
    labels: Optional[List[str]] = None
    # Flag read from the YAML key 'adjust outliers'; defaults to False.
    adjust_outliers: bool = Field(False, alias='adjust outliers')

class DataSourceConfig(BaseModel):
    """
    Model for a single data source entry in the YAML configuration.

    YAML keys that contain spaces (``data type``, ``remove column name text``)
    are mapped onto attribute names through field aliases. Keys that are not
    declared here are accepted and kept on the instance (``extra='allow'``).
    """
    # pydantic v2 style configuration (class-based ``Config`` is deprecated).
    # ``populate_by_name`` is understood by every pydantic 2.x release, whereas
    # ``validate_by_name`` is only recognised from pydantic 2.11 onwards.
    model_config = {'populate_by_name': True, 'extra': 'allow'}

    name: str
    description: Optional[str] = None
    data_type: str = Field(..., alias='data type')
    filename: str
    columns: Optional[List[str]] = None
    numeric_cols: Optional[Dict[str, NumericColumnConfig]] = None
    plugin: Optional[str] = None
    date: Optional[str] = None
    remove_column_name_text: Optional[List[str]] = Field(None, alias='remove column name text')

    content: Optional[Any] = None  # Placeholder for loaded content
    content_type: Optional[str] = None  # Placeholder for content type after loading

# List type used for the ``data sources`` section of the configuration.
DataSourceConfigList = List[DataSourceConfig]


class ConfigData(BaseModel):
    """
    Model for the top-level structure of the YAML configuration data.

    Unknown top-level keys in the YAML are accepted and kept (``extra='allow'``).
    """
    # pydantic v2 style configuration (class-based ``Config`` is deprecated).
    # ``populate_by_name`` is understood by every pydantic 2.x release, whereas
    # ``validate_by_name`` is only recognised from pydantic 2.11 onwards.
    model_config = {'populate_by_name': True, 'extra': 'allow'}

    # Required list of data sources (YAML key: 'data sources').
    data_sources: DataSourceConfigList = Field(..., alias='data sources')
    # NOTE(review): the alias is singular ('custom_age_range') while the attribute
    # is plural; confirm this matches the key actually used in the YAML files.
    custom_age_ranges: Optional[Dict[str, List[Union[int, float]]]] = Field(None, alias='custom_age_range')


class JSDConfig:
"""
The JSDConfig class loads and stores data from a YAML file.
Expand All @@ -38,13 +85,16 @@ class JSDConfig:

Methods:
__init__(self, filename='jsdconfig.yaml'): Initializes a new instance of JSDConfig.
__post_init__(self): Loads the YAML data from the current filename.
_load_data(self): Loads the YAML data from the current filename.
set_filename(self, new_filename): Sets a new filename and reloads the data.
"""
filename: str = 'jsdconfig.yaml'
data: dict = field(init=False)
filename: str
data: Optional[ConfigData]

def __post_init__(self):
def __init__(self, filename: str = 'jsdconfig.yaml'):
"""Load the YAML data from the current filename."""
self.filename = filename
self.data = None
# os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
self._load_data()

Expand All @@ -53,11 +103,16 @@ def _load_data(self):
if not os.path.exists(self.filename):
print(f"File {self.filename} does not exist. Skipping load.")
print(f"Current working directory: {os.getcwd()}")
self.data = {}
self.data = None
return

with open(self.filename, 'r', encoding='utf-8') as stream:
self.data = load(stream, Loader=Loader)
raw = load(stream, Loader=Loader)
try:
self.data = ConfigData(**raw)
except ValidationError as e:
self.data = None
raise
# print(dump(self.data))

def set_filename(self, new_filename: str):
Expand Down
Loading
Loading