diff --git a/.all-contributorsrc b/.all-contributorsrc
new file mode 100644
index 0000000..585a78c
--- /dev/null
+++ b/.all-contributorsrc
@@ -0,0 +1,47 @@
+{
+ "files": [
+ "README.md"
+ ],
+ "imageSize": 100,
+ "commit": false,
+ "commitType": "docs",
+ "commitConvention": "angular",
+ "contributors": [
+ {
+ "login": "danielawitten",
+ "name": "danielawitten",
+ "avatar_url": "https://avatars.githubusercontent.com/u/12654191?v=4",
+ "profile": "https://github.com/danielawitten",
+ "contributions": [
+ "code",
+ "content"
+ ]
+ },
+ {
+ "login": "trevorhastie",
+ "name": "trevorhastie",
+ "avatar_url": "https://avatars.githubusercontent.com/u/13293253?v=4",
+ "profile": "https://web.stanford.edu/~hastie/",
+ "contributions": [
+ "code",
+ "content"
+ ]
+ },
+ {
+ "login": "tibshirani",
+ "name": "tibshirani",
+ "avatar_url": "https://avatars.githubusercontent.com/u/2848609?v=4",
+ "profile": "https://github.com/tibshirani",
+ "contributions": [
+ "code",
+ "content"
+ ]
+ }
+ ],
+ "contributorsPerLine": 7,
+ "skipCi": true,
+ "repoType": "github",
+ "repoHost": "https://github.com",
+ "projectName": "ISLP",
+ "projectOwner": "intro-stat-learning"
+}
diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml
new file mode 100644
index 0000000..9260015
--- /dev/null
+++ b/.github/workflows/build_docs.yml
@@ -0,0 +1,85 @@
+# This builds and deploys ISLP docs
+
+name: Build docs
+
+# Controls when the workflow will run
+on:
+ workflow_dispatch:
+ # manually triggered; no inputs defined
+
+# A workflow run is made up of one or more jobs that can run
+# sequentially or in parallel
+
+jobs: # This workflow contains a single
+ # job called "build"
+
+ build:
+ # The type of runner that the job will run on
+ runs-on: ubuntu-latest
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v4
+ with:
+ python-version: '3.12'
+ cache: 'pip'
+ # Install
+ - name: Install dependencies
+ run: |
+ sudo apt-get install r-base
+ pip install -r docs/requirements.txt
+ pip install .
+
+ # Checkout labs
+ - name: Checkout version of labs
+ run: |
+ git submodule update --init --force docs/ISLP_labs
+ cd docs
+ mkdir -p source/labs
+ cp ISLP_labs/Ch*nb source/labs
+ python fix_and_clear_notebooks.py source/labs/Ch*nb --rm_md
+ python make_notebooks.py --inplace --requirements=ISLP_labs/requirements.txt source/labs/Ch06-varselect-lab.ipynb
+ rm source/labs/Ch*md
+
+ - name: Make docs
+ run: |
+ cd docs
+ make html
+
+ # Store the output
+ - name: Upload docs
+ uses: actions/upload-artifact@v4
+ with:
+ name: ISLP_docs
+ path: docs/build/html
+ retention-days: 5
+
+ deploy:
+ runs-on: ubuntu-latest
+ needs: build
+
+ # Grant GITHUB_TOKEN the permissions required to make a Pages deployment
+ permissions:
+ pages: write # to deploy to Pages
+ id-token: write # to verify the deployment originates from an appropriate source
+
+ environment:
+ name: github-pages
+ url: ${{ steps.deployment.outputs.page_url }}
+
+ steps:
+ - uses: actions/download-artifact@v4
+ with:
+ name: ISLP_docs
+ path: .
+ - uses: actions/configure-pages@v4
+ # NOTE: configure-pages takes no node-version input;
+ # the stray key was removed
+ - uses: actions/upload-pages-artifact@v3
+ with:
+ # node-version is not an input of upload-pages-artifact
+ path: .
+ - id: deployment
+ uses: actions/deploy-pages@v4
\ No newline at end of file
diff --git a/.github/workflows/build_notebook.yml b/.github/workflows/build_notebook.yml
new file mode 100644
index 0000000..dbf97e8
--- /dev/null
+++ b/.github/workflows/build_notebook.yml
@@ -0,0 +1,105 @@
+# This is a basic workflow to help you get started with Actions
+
+name: Build a notebook
+
+# Controls when the workflow will run
+on:
+ workflow_dispatch:
+ inputs:
+ LABS:
+ description: 'Labs version'
+ required: true
+ default: 'v2'
+ type: string
+ ID:
+ description: 'Which lab to build'
+ required: true
+ default: '03'
+ type: string
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+ # This workflow contains a single job called "build"
+ build-linux:
+ # The type of runner that the job will run on
+ runs-on: ubuntu-latest
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+ cache: 'pip'
+
+ # Install
+ - name: Install dependencies
+ run: |
+ pip install .
+ pip install jupyterlab
+
+ # Runs a set of commands using the runner's shell
+ - name: Build notebook
+ env:
+ LABS: ${{ inputs.LABS }}
+ ID: ${{ inputs.ID }}
+ run: |
+ git clone https://github.com/intro-stat-learning/ISLP_labs.git
+ cd ISLP_labs
+ git checkout $LABS
+ cp Ch*$ID*lab.ipynb ..
+ jupyter nbconvert --execute --inplace ../Ch*$ID*lab.ipynb
+ jupyter nbconvert --to html ../Ch*$ID*lab.ipynb
+
+ # Store the output
+ - name: Upload labs
+ env:
+ ID: ${{ inputs.ID }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: ISLP_labs-${{ runner.os }}
+ path: Ch*
+ retention-days: 1
+
+ build-mac:
+ # The type of runner that the job will run on
+ runs-on: macos-latest
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+ cache: 'pip'
+
+ # Install
+ - name: Install dependencies
+ run: |
+ pip install .
+ pip install jupyterlab
+
+ # Runs a set of commands using the runner's shell
+ - name: Build notebook
+ env:
+ LABS: ${{ inputs.LABS }}
+ ID: ${{ inputs.ID }}
+ run: |
+ git clone https://github.com/intro-stat-learning/ISLP_labs.git
+ cd ISLP_labs
+ git checkout $LABS
+ cp Ch*$ID*lab.ipynb ..
+ jupyter nbconvert --execute --inplace ../Ch*$ID*lab.ipynb
+ jupyter nbconvert --to html ../Ch*$ID*lab.ipynb
+
+ # Store the output
+ - name: Upload labs
+ env:
+ ID: ${{ inputs.ID }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: ISLP_labs-${{ runner.os }}
+ path: Ch*
+ retention-days: 1
\ No newline at end of file
diff --git a/.github/workflows/build_notebook_errors.yml b/.github/workflows/build_notebook_errors.yml
new file mode 100644
index 0000000..d5fabee
--- /dev/null
+++ b/.github/workflows/build_notebook_errors.yml
@@ -0,0 +1,104 @@
+# This is a basic workflow to help you get started with Actions
+
+name: Build a notebook (allow errors, capture result)
+
+# Controls when the workflow will run
+on:
+ workflow_dispatch:
+ inputs:
+ LABS:
+ description: 'Labs version'
+ required: true
+ default: 'v2'
+ type: string
+ ID:
+ description: 'Which lab to build'
+ required: true
+ default: '02'
+ type: string
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+
+ build-linux:
+ # The type of runner that the job will run on
+ runs-on: ubuntu-latest
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+ cache: 'pip'
+
+ # Install
+ - name: Install dependencies
+ run: |
+ pip install .
+
+ # Runs a set of commands using the runner's shell
+ - name: Build notebook, allowing errors
+ env:
+ LABS: ${{ inputs.LABS }}
+ ID: ${{ inputs.ID }}
+ run: |
+ git clone https://github.com/intro-stat-learning/ISLP_labs.git
+ cd ISLP_labs
+ git checkout $LABS
+ cp Ch*$ID*lab.ipynb ..
+ jupyter nbconvert --execute --inplace ../Ch*$ID*lab.ipynb --allow-errors
+ jupyter nbconvert --to html ../Ch*$ID*lab.ipynb
+
+ # Store the output
+ - name: Upload labs
+ env:
+ ID: ${{ inputs.ID }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: ISLP_labs-${{ runner.os }}
+ path: Ch*
+ retention-days: 1
+
+ build-mac:
+ # The type of runner that the job will run on
+ runs-on: macos-latest
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+ cache: 'pip'
+
+ # Install
+ - name: Install dependencies
+ run: |
+ pip install .
+
+ # Runs a set of commands using the runner's shell
+ - name: Build notebook, allowing errors
+ env:
+ LABS: ${{ inputs.LABS }}
+ ID: ${{ inputs.ID }}
+ run: |
+ git clone https://github.com/intro-stat-learning/ISLP_labs.git
+ cd ISLP_labs
+ git checkout $LABS
+ cp Ch*$ID*lab.ipynb ..
+ jupyter nbconvert --execute --inplace ../Ch*$ID*lab.ipynb --allow-errors
+ jupyter nbconvert --to html ../Ch*$ID*lab.ipynb
+
+ # Store the output
+ - name: Upload labs
+ env:
+ ID: ${{ inputs.ID }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: ISLP_labs-${{ runner.os }}
+ path: Ch*
+ retention-days: 1
+
diff --git a/.github/workflows/build_save_labs.yml b/.github/workflows/build_save_labs.yml
new file mode 100644
index 0000000..57ebf78
--- /dev/null
+++ b/.github/workflows/build_save_labs.yml
@@ -0,0 +1,104 @@
+# This is a basic workflow to help you get started with Actions
+
+name: Build + save notebooks (not 10,13)
+
+# Controls when the workflow will run
+on:
+ workflow_dispatch:
+ inputs:
+ LABS:
+ description: 'Labs version'
+ required: true
+ default: 'v2'
+ type: string
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+ # This workflow contains a single job called "build"
+ build:
+ # The type of runner that the job will run on
+ runs-on: ubuntu-latest
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+ cache: 'pip'
+
+ # Install
+ - name: Install dependencies
+ run: |
+ pip install .
+
+ # Runs a set of commands using the runner's shell
+ - name: Build Ch02 notebook (allow errors)
+ env:
+ LABS: ${{ inputs.LABS }}
+ run: |
+ git clone https://github.com/intro-stat-learning/ISLP_labs.git
+ cd ISLP_labs
+ git checkout $LABS
+ rm Ch10*
+ rm Ch13*
+ jupyter nbconvert --execute --inplace --allow-errors Ch02*lab.ipynb
+
+ - name: Build Ch03 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch03*lab.ipynb
+
+ - name: Build Ch04 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch04*lab.ipynb
+
+ - name: Build Ch05 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch05*lab.ipynb
+
+ - name: Build Ch06 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch06*lab.ipynb
+
+ - name: Build Ch07 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch07*lab.ipynb
+
+ - name: Build Ch08 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch08*lab.ipynb
+
+ - name: Build Ch09 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch09*lab.ipynb
+
+ - name: Build Ch11 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch11*lab.ipynb
+
+ - name: Build Ch12 notebook
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --execute --inplace Ch12*lab.ipynb
+
+ - name: Build HTML
+ run: |
+ cd ISLP_labs
+ jupyter nbconvert --to html Ch*ipynb
+
+ # Store the output
+ - name: Upload labs
+ uses: actions/upload-artifact@v4
+ with:
+ name: ISLP_labs
+ path: ISLP_labs/Ch*
+ retention-days: 1
\ No newline at end of file
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml
new file mode 100644
index 0000000..767a62e
--- /dev/null
+++ b/.github/workflows/build_test.yml
@@ -0,0 +1,102 @@
+name: Build and test
+
+on: [push]
+
+jobs:
+ build-linux:
+ runs-on: ubuntu-latest
+ strategy:
+ max-parallel: 5
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python 3.12
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.12'
+ # NOTE(review): conda is never used in this job (dependencies are
+ # installed with pip), so the former "Add conda to system path"
+ # step was a no-op and has been removed rather than writing
+ # $CONDA/bin to GITHUB_PATH unnecessarily.
+ - name: Install dependencies
+ run: |
+ pip install .
+ - name: Lint with flake8
+ run: |
+ pip install flake8
+ # stop the build if there are Python syntax errors or undefined names
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Test with pytest
+ timeout-minutes: 12
+ run: |
+ pip install torchvision torchinfo
+ pip install pytest
+ pytest
+
+ build-windows:
+ runs-on: windows-latest
+ strategy:
+ max-parallel: 5
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python 3.12
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.12'
+ # NOTE(review): conda is never used in this job (dependencies are
+ # installed with pip); moreover on windows-latest the default
+ # shell is pwsh, where `echo $CONDA/bin` expands to a wrong path.
+ # The vestigial "Add conda to system path" step has been removed.
+ - name: Install dependencies
+ run: |
+ pip install .
+ - name: Lint with flake8
+ run: |
+ pip install flake8
+ # stop the build if there are Python syntax errors or undefined names
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Test with pytest
+ timeout-minutes: 12
+ run: |
+ pip install torchvision torchinfo
+ pip install pytest
+ pytest
+
+ build-mac:
+ runs-on: macos-latest
+ strategy:
+ max-parallel: 5
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python 3.12
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.12'
+ # NOTE(review): conda is never used in this job (dependencies are
+ # installed with pip), so the former "Add conda to system path"
+ # step was a no-op and has been removed rather than writing
+ # $CONDA/bin to GITHUB_PATH unnecessarily.
+ - name: Install dependencies
+ run: |
+ pip install .
+ - name: Lint with flake8
+ run: |
+ pip install flake8
+ # stop the build if there are Python syntax errors or undefined names
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Test with pytest
+ timeout-minutes: 12
+ run: |
+ pip install torchvision torchinfo
+ pip install pytest
+ pytest --ignore tests/deeplearning/test_hitters.py --ignore tests/deeplearning/test_mnist.py
+
+
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..891a60a
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "docs/ISLP_labs"]
+ path = docs/ISLP_labs
+ url = https://github.com/intro-stat-learning/ISLP_labs
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index aacca4b..44bfa25 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -9,7 +9,15 @@ version: 2
build:
os: ubuntu-22.04
tools:
- python: "3.9"
+ python: "3.11"
+ apt_packages:
+ - r-base
+ jobs:
+ pre_build:
+ - python docs/fix_and_clear_notebooks.py
+
+submodules:
+ include: all
# Build documentation in the docs/ directory with Sphinx
sphinx:
@@ -22,8 +30,7 @@ sphinx:
# Optionally declare the Python requirements required to build your docs
python:
install:
- - requirements: requirements.txt
+ - requirements: docs/ISLP_labs/requirements.txt
- requirements: docs/requirements.txt
- - requirements: torch_requirements.txt
- method: pip
path: .
diff --git a/ISLP/__init__.py b/ISLP/__init__.py
index ae230d3..6cd1ee1 100644
--- a/ISLP/__init__.py
+++ b/ISLP/__init__.py
@@ -6,28 +6,74 @@
"""
from os.path import join as pjoin
+from importlib.resources import (as_file,
+ files)
import pandas as pd, numpy as np
-from pkg_resources import resource_filename
+from sklearn.metrics import confusion_matrix as _confusion_matrix
+from sklearn.metrics._classification import unique_labels
# data originally saved via: [sm.datasets.get_rdataset(n, 'ISLR').data.to_csv('../ISLP/data/%s.csv' % n, index=False) for n in ['Carseats', 'College', 'Credit', 'Default', 'Hitters', 'Auto', 'OJ', 'Portfolio', 'Smarket', 'Wage', 'Weekly', 'Caravan']]
+def _make_categorical(dataset):
+ unordered = _unordered.setdefault(dataset, [])
+ ordered = _ordered.setdefault(dataset, [])
+ with as_file(files('ISLP').joinpath('data', '%s.csv' % dataset)) as filename:
+ df = pd.read_csv(filename)
+ for col in unordered:
+ df[col] = pd.Categorical(df[col])
+ for col in ordered:
+ df[col] = pd.Categorical(df[col], ordered=True)
+ if dataset in _index:
+ df = df.set_index(_index[dataset])
+ return df
+
+_unordered = {'Hitters':['League', 'Division', 'NewLeague'],
+ 'Caravan':['Purchase'],
+ 'Carseats':['ShelveLoc', 'Urban', 'US'],
+ 'College':['Private'],
+ 'Publication':['mech'],
+ 'BrainCancer':['sex', 'diagnosis', 'loc', 'stereo'],
+ 'Wage':['maritl', 'race', 'region', 'jobclass', 'health', 'health_ins'],
+ 'Default':['default', 'student'],
+ 'Credit':['Gender', 'Student', 'Married', 'Ethnicity'],
+ 'OJ':['Purchase', 'Store7'],
+ 'Smarket':['Direction'],
+ 'Weekly':['Direction']
+ }
+_ordered = {'Wage':['education'],
+ }
+_index = {'Auto':'name'}
+
+_datasets = sorted(list(_unordered.keys()) +
+ list(_ordered.keys()) +
+ ['NCI60',
+ 'Khan',
+ 'Bikeshare',
+ 'NYSE'])
+
def load_data(dataset):
+
if dataset == 'NCI60':
- features = resource_filename('ISLP', pjoin('data', 'NCI60data.npy'))
- X = np.load(features)
- labels = resource_filename('ISLP', pjoin('data', 'NCI60labs.csv'))
- Y = pd.read_csv(labels)
+ with as_file(files('ISLP').joinpath('data', 'NCI60data.npy')) as features:
+ X = np.load(features)
+ with as_file(files('ISLP').joinpath('data', 'NCI60labs.csv')) as labels:
+ Y = pd.read_csv(labels)
return {'data':X, 'labels':Y}
elif dataset == 'Khan':
- xtest = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Khan_xtest.csv')))
+ with as_file(files('ISLP').joinpath('data', 'Khan_xtest.csv')) as xtest:
+ xtest = pd.read_csv(xtest)
+ xtest = xtest.rename(columns=dict([('V%d' % d, 'G%04d' % d) for d in range(1, len(xtest.columns)+0)]))  # NOTE(review): '+0' leaves the last V column unrenamed (range end is exclusive) — confirm intended
- ytest = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Khan_ytest.csv')))
+ with as_file(files('ISLP').joinpath('data', 'Khan_ytest.csv')) as ytest:
+ ytest = pd.read_csv(ytest)
ytest = ytest.rename(columns={'x':'Y'})
ytest = ytest['Y']
- xtrain = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Khan_xtrain.csv')))
- xtrain = xtrain.rename(columns=dict([('V%d' % d, 'G%04d' % d) for d in range(1, len(xtest.columns)+0)]))
- ytrain = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Khan_ytrain.csv')))
+ with as_file(files('ISLP').joinpath('data', 'Khan_xtrain.csv')) as xtrain:
+ xtrain = pd.read_csv(xtrain)
+ xtrain = xtrain.rename(columns=dict([('V%d' % d, 'G%04d' % d) for d in range(1, len(xtest.columns)+0)]))  # NOTE(review): uses len(xtest.columns) for xtrain — only correct if train/test column counts match; confirm
+
+ with as_file(files('ISLP').joinpath('data', 'Khan_ytrain.csv')) as ytrain:
+ ytrain = pd.read_csv(ytrain)
ytrain = ytrain.rename(columns={'x':'Y'})
ytrain = ytrain['Y']
@@ -35,35 +81,10 @@ def load_data(dataset):
'xtrain':xtrain,
'ytest':ytest,
'ytrain':ytrain}
- elif dataset == 'Hitters':
- filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset))
- df = pd.read_csv(filename)
- for col in ['League', 'Division', 'NewLeague']:
- df[col] = pd.Categorical(df[col])
- return df
- elif dataset == 'Carseats':
- filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset))
- df = pd.read_csv(filename)
- for col in ['ShelveLoc', 'Urban', 'US']:
- df[col] = pd.Categorical(df[col])
- return df
- elif dataset == 'NYSE':
- filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset))
- df = pd.read_csv(filename).set_index('date')
- return df
- elif dataset == 'Publication':
- df = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Publication.csv')))
- for col in ['mech']:
- df[col] = pd.Categorical(df[col])
- return df
- elif dataset == 'BrainCancer':
- df = pd.read_csv(resource_filename('ISLP', pjoin('data', 'BrainCancer.csv')))
- for col in ['sex', 'diagnosis', 'loc', 'stereo']:
- df[col] = pd.Categorical(df[col])
- return df
+
elif dataset == 'Bikeshare':
- filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset))
- df = pd.read_csv(filename)
+ with as_file(files('ISLP').joinpath('data', '%s.csv' % dataset)) as filename:
+ df = pd.read_csv(filename)
df['weathersit'] = pd.Categorical(df['weathersit'], ordered=False)
# setting order to avoid alphabetical
df['mnth'] = pd.Categorical(df['mnth'],
@@ -78,26 +99,60 @@ def load_data(dataset):
ordered=False,
categories=range(24))
return df
- elif dataset == 'Wage':
- df = pd.read_csv(resource_filename('ISLP', pjoin('data', 'Wage.csv')))
- df['education'] = pd.Categorical(df['education'], ordered=True)
- return df
+ elif dataset == 'NYSE':
+ with as_file(files('ISLP').joinpath('data', '%s.csv' % dataset)) as filename:
+ df = pd.read_csv(filename)
+ # setting order to avoid alphabetical
+ df['day_of_week'] = pd.Categorical(df['day_of_week'],
+ ordered=False,
+ categories=['mon',
+ 'tues',
+ 'wed',
+ 'thur',
+ 'fri'])
+ return df.set_index('date')
else:
- filename = resource_filename('ISLP', pjoin('data', '%s.csv' % dataset))
- return pd.read_csv(filename)
+ return _make_categorical(dataset)
+load_data.__doc__ = f"""
+Load dataset from ISLP package.
-from sklearn.metrics import confusion_matrix as _confusion_matrix
+Choices are: {_datasets}
+
+Parameters
+----------
+
+dataset: str
+
+Returns
+-------
+
+data: array-like or dict
+ Either a `pd.DataFrame` representing the dataset or a dictionary
+ containing different parts of the dataset.
+
+"""
def confusion_table(predicted_labels,
- true_labels):
+ true_labels,
+ labels=None):
"""
Return a data frame version of confusion
matrix with rows given by predicted label
and columns the truth.
+
+ Parameters
+ ----------
+
+ predicted_labels: array-like
+ These will form rows of confusion matrix.
+
+ true_labels: array-like
+ These will form columns of confusion matrix.
"""
- labels = sorted(np.unique(list(true_labels) +
- list(predicted_labels)))
+ if labels is None:
+ labels = unique_labels(true_labels,
+ predicted_labels)
C = _confusion_matrix(true_labels,
predicted_labels,
labels=labels)
@@ -109,3 +164,4 @@ def confusion_table(predicted_labels,
from . import _version
__version__ = _version.get_versions()['version']
+
diff --git a/ISLP/_version.py b/ISLP/_version.py
index 9b01ea2..c2d7406 100644
--- a/ISLP/_version.py
+++ b/ISLP/_version.py
@@ -5,8 +5,9 @@
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.
-# This file is released into the public domain. Generated by
-# versioneer-0.21 (https://github.com/python-versioneer/python-versioneer)
+# This file is released into the public domain.
+# Generated by versioneer-0.29
+# https://github.com/python-versioneer/python-versioneer
"""Git implementation of _version.py."""
@@ -15,10 +16,11 @@
import re
import subprocess
import sys
-from typing import Callable, Dict
+from typing import Any, Callable, Dict, List, Optional, Tuple
+import functools
-def get_keywords():
+def get_keywords() -> Dict[str, str]:
"""Get the keywords needed to look up the version information."""
# these strings will be replaced by git during git-archive.
# setup.py/versioneer.py will grep for the variable names, so they must
@@ -34,8 +36,15 @@ def get_keywords():
class VersioneerConfig:
"""Container for Versioneer configuration parameters."""
+ VCS: str
+ style: str
+ tag_prefix: str
+ parentdir_prefix: str
+ versionfile_source: str
+ verbose: bool
-def get_config():
+
+def get_config() -> VersioneerConfig:
"""Create, populate and return the VersioneerConfig() object."""
# these strings are filled in when 'setup.py versioneer' creates
# _version.py
@@ -57,9 +66,9 @@ class NotThisMethod(Exception):
HANDLERS: Dict[str, Dict[str, Callable]] = {}
-def register_vcs_handler(vcs, method): # decorator
+def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator
"""Create decorator to mark a method as the handler of a VCS."""
- def decorate(f):
+ def decorate(f: Callable) -> Callable:
"""Store f in HANDLERS[vcs][method]."""
if vcs not in HANDLERS:
HANDLERS[vcs] = {}
@@ -68,11 +77,25 @@ def decorate(f):
return decorate
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
- env=None):
+def run_command(
+ commands: List[str],
+ args: List[str],
+ cwd: Optional[str] = None,
+ verbose: bool = False,
+ hide_stderr: bool = False,
+ env: Optional[Dict[str, str]] = None,
+) -> Tuple[Optional[str], Optional[int]]:
"""Call the given command(s)."""
assert isinstance(commands, list)
process = None
+
+ popen_kwargs: Dict[str, Any] = {}
+ if sys.platform == "win32":
+ # This hides the console window if pythonw.exe is used
+ startupinfo = subprocess.STARTUPINFO()
+ startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+ popen_kwargs["startupinfo"] = startupinfo
+
for command in commands:
try:
dispcmd = str([command] + args)
@@ -80,10 +103,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
process = subprocess.Popen([command] + args, cwd=cwd, env=env,
stdout=subprocess.PIPE,
stderr=(subprocess.PIPE if hide_stderr
- else None))
+ else None), **popen_kwargs)
break
- except OSError:
- e = sys.exc_info()[1]
+ except OSError as e:
if e.errno == errno.ENOENT:
continue
if verbose:
@@ -103,7 +125,11 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
return stdout, process.returncode
-def versions_from_parentdir(parentdir_prefix, root, verbose):
+def versions_from_parentdir(
+ parentdir_prefix: str,
+ root: str,
+ verbose: bool,
+) -> Dict[str, Any]:
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes both
@@ -128,13 +154,13 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
@register_vcs_handler("git", "get_keywords")
-def git_get_keywords(versionfile_abs):
+def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
"""Extract version information from the given file."""
# the code embedded in _version.py can just fetch the value of these
# keywords. When used from setup.py, we don't want to import _version.py,
# so we do it with a regexp instead. This function is not used from
# _version.py.
- keywords = {}
+ keywords: Dict[str, str] = {}
try:
with open(versionfile_abs, "r") as fobj:
for line in fobj:
@@ -156,7 +182,11 @@ def git_get_keywords(versionfile_abs):
@register_vcs_handler("git", "keywords")
-def git_versions_from_keywords(keywords, tag_prefix, verbose):
+def git_versions_from_keywords(
+ keywords: Dict[str, str],
+ tag_prefix: str,
+ verbose: bool,
+) -> Dict[str, Any]:
"""Get version information from git keywords."""
if "refnames" not in keywords:
raise NotThisMethod("Short version file found")
@@ -220,7 +250,12 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
@register_vcs_handler("git", "pieces_from_vcs")
-def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
+def git_pieces_from_vcs(
+ tag_prefix: str,
+ root: str,
+ verbose: bool,
+ runner: Callable = run_command
+) -> Dict[str, Any]:
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
@@ -228,13 +263,18 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
version string, meaning we're inside a checked out source tree.
"""
GITS = ["git"]
- TAG_PREFIX_REGEX = "*"
if sys.platform == "win32":
GITS = ["git.cmd", "git.exe"]
- TAG_PREFIX_REGEX = r"\*"
+
+ # GIT_DIR can interfere with correct operation of Versioneer.
+ # It may be intended to be passed to the Versioneer-versioned project,
+ # but that should not change where we get our version from.
+ env = os.environ.copy()
+ env.pop("GIT_DIR", None)
+ runner = functools.partial(runner, env=env)
_, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
- hide_stderr=True)
+ hide_stderr=not verbose)
if rc != 0:
if verbose:
print("Directory %s not under git control" % root)
@@ -242,11 +282,10 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
# if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
# if there isn't one, this yields HEX[-dirty] (no NUM)
- describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty",
- "--always", "--long",
- "--match",
- "%s%s" % (tag_prefix, TAG_PREFIX_REGEX)],
- cwd=root)
+ describe_out, rc = runner(GITS, [
+ "describe", "--tags", "--dirty", "--always", "--long",
+ "--match", f"{tag_prefix}[[:digit:]]*"
+ ], cwd=root)
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
@@ -256,7 +295,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
raise NotThisMethod("'git rev-parse' failed")
full_out = full_out.strip()
- pieces = {}
+ pieces: Dict[str, Any] = {}
pieces["long"] = full_out
pieces["short"] = full_out[:7] # maybe improved later
pieces["error"] = None
@@ -335,8 +374,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
else:
# HEX: no tags
pieces["closest-tag"] = None
- count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
- pieces["distance"] = int(count_out) # total number of commits
+ out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
+ pieces["distance"] = len(out.split()) # total number of commits
# commit date: see ISO-8601 comment in git_versions_from_keywords()
date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
@@ -348,14 +387,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
return pieces
-def plus_or_dot(pieces):
+def plus_or_dot(pieces: Dict[str, Any]) -> str:
"""Return a + if we don't already have one, else return a ."""
if "+" in pieces.get("closest-tag", ""):
return "."
return "+"
-def render_pep440(pieces):
+def render_pep440(pieces: Dict[str, Any]) -> str:
"""Build up version string, with post-release "local version identifier".
Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
@@ -380,7 +419,7 @@ def render_pep440(pieces):
return rendered
-def render_pep440_branch(pieces):
+def render_pep440_branch(pieces: Dict[str, Any]) -> str:
"""TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
The ".dev0" means not master branch. Note that .dev0 sorts backwards
@@ -410,7 +449,7 @@ def render_pep440_branch(pieces):
return rendered
-def pep440_split_post(ver):
+def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
"""Split pep440 version string at the post-release segment.
Returns the release segments before the post-release and the
@@ -420,7 +459,7 @@ def pep440_split_post(ver):
return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
-def render_pep440_pre(pieces):
+def render_pep440_pre(pieces: Dict[str, Any]) -> str:
"""TAG[.postN.devDISTANCE] -- No -dirty.
Exceptions:
@@ -432,7 +471,7 @@ def render_pep440_pre(pieces):
tag_version, post_version = pep440_split_post(pieces["closest-tag"])
rendered = tag_version
if post_version is not None:
- rendered += ".post%d.dev%d" % (post_version+1, pieces["distance"])
+ rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
else:
rendered += ".post0.dev%d" % (pieces["distance"])
else:
@@ -444,7 +483,7 @@ def render_pep440_pre(pieces):
return rendered
-def render_pep440_post(pieces):
+def render_pep440_post(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX] .
The ".dev0" means dirty. Note that .dev0 sorts backwards
@@ -471,7 +510,7 @@ def render_pep440_post(pieces):
return rendered
-def render_pep440_post_branch(pieces):
+def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
The ".dev0" means not master branch.
@@ -500,7 +539,7 @@ def render_pep440_post_branch(pieces):
return rendered
-def render_pep440_old(pieces):
+def render_pep440_old(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]] .
The ".dev0" means dirty.
@@ -522,7 +561,7 @@ def render_pep440_old(pieces):
return rendered
-def render_git_describe(pieces):
+def render_git_describe(pieces: Dict[str, Any]) -> str:
"""TAG[-DISTANCE-gHEX][-dirty].
Like 'git describe --tags --dirty --always'.
@@ -542,7 +581,7 @@ def render_git_describe(pieces):
return rendered
-def render_git_describe_long(pieces):
+def render_git_describe_long(pieces: Dict[str, Any]) -> str:
"""TAG-DISTANCE-gHEX[-dirty].
Like 'git describe --tags --dirty --always -long'.
@@ -562,7 +601,7 @@ def render_git_describe_long(pieces):
return rendered
-def render(pieces, style):
+def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
"""Render the given version pieces into the requested style."""
if pieces["error"]:
return {"version": "unknown",
@@ -598,7 +637,7 @@ def render(pieces, style):
"date": pieces.get("date")}
-def get_versions():
+def get_versions() -> Dict[str, Any]:
"""Get version information or return default if unable to do so."""
# I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
# __file__, we can work backwards from there to the root. Some
diff --git a/ISLP/bart/bart.py b/ISLP/bart/bart.py
index 2c33aba..3c933ca 100644
--- a/ISLP/bart/bart.py
+++ b/ISLP/bart/bart.py
@@ -101,11 +101,11 @@ def predict(self,
check_is_fitted(self)
nsample = len(self.trees_sample_)
- output = np.zeros(X.shape[0], np.float)
+ output = np.zeros(X.shape[0], float)
for trees in self.trees_sample_:
for tree in trees:
- tree_fit = np.array([tree.predict_out_of_sample(x) for x in X])
+ tree_fit = np.array([tree.predict_out_of_sample(x) for x in np.asarray(X)])
output += tree_fit
output = output / nsample
return self._inverse(output)
@@ -118,11 +118,11 @@ def staged_predict(self,
trees_sample_ = self.trees_sample_[start_idx:]
nsample = len(trees_sample_)
- output = np.zeros((nsample, X.shape[0]), np.float)
+ output = np.zeros((nsample, X.shape[0]), float)
for nstep, trees in enumerate(trees_sample_):
for tree in trees:
- tree_fit = np.array([tree.predict_out_of_sample(x) for x in X])
+ tree_fit = np.array([tree.predict_out_of_sample(x) for x in np.asarray(X)])
output[nstep] += tree_fit
output = np.cumsum(output, 0) / (np.arange(nsample) + 1)[:,None]
@@ -141,7 +141,7 @@ def fit(self,
if self.n_jobs <= 0:
n_jobs = 1
- random_idx = random_state.randint(0,2**32-1,size=(n_jobs,))
+ random_idx = random_state.randint(0,2**30-1,size=(n_jobs,)) # keep seeds well below 2**31-1 so they fit in int32
parallel = Parallel(n_jobs=len(random_idx))
diff --git a/ISLP/bart/likelihood.py b/ISLP/bart/likelihood.py
index 28f341d..cfa3ce6 100644
--- a/ISLP/bart/likelihood.py
+++ b/ISLP/bart/likelihood.py
@@ -82,7 +82,7 @@ def marginal_loglikelihood(response,
if not incremental:
if responsesq_sum is None:
responsesq_sum = (response**2).sum()
- response_moments = (n, response_sum, responseseq_sum)
+ response_moments = (n, response_sum, responsesq_sum)
logL -= n * 0.5 * np.log(sigmasq)
logL -= 0.5 * responsesq_sum / sigmasq
diff --git a/ISLP/bart/tree.py b/ISLP/bart/tree.py
index 8726929..49b4789 100644
--- a/ISLP/bart/tree.py
+++ b/ISLP/bart/tree.py
@@ -96,7 +96,7 @@ def predict_output(self):
current_node = self.get_node(node_index)
output[current_node.idx_data_points] = current_node.value
- return output.astype(np.float)
+ return output.astype(float)
def predict_out_of_sample(self, X):
"""
diff --git a/ISLP/info.py b/ISLP/info.py
deleted file mode 100644
index 3a1fecd..0000000
--- a/ISLP/info.py
+++ /dev/null
@@ -1,78 +0,0 @@
-""" This file contains defines parameters for regreg that we use to fill
-settings in setup.py, the regreg top-level docstring, and for building the docs.
-In setup.py in particular, we exec this file, so it cannot import regreg
-"""
-
-# regreg version information. An empty _version_extra corresponds to a
-# full release. '.dev' as a _version_extra string means this is a development
-# version
-_version_major = 0
-_version_minor = 2
-_version_micro = 0
-_version_extra = ''
-
-# Format expected by setup.py and doc/source/conf.py: string of form "X.Y.Z"
-__version__ = "%s.%s.%s%s" % (_version_major,
- _version_minor,
- _version_micro,
- _version_extra)
-
-CLASSIFIERS = ["Development Status :: 3 - Alpha",
- "Environment :: Console",
- "Intended Audience :: Science/Research",
- "License :: OSI Approved :: BSD License",
- "Operating System :: OS Independent",
- "Programming Language :: Python",
- "Topic :: Scientific/Engineering"]
-
-description = 'Testing a fixed value of lambda'
-
-# Note: this long_description is actually a copy/paste from the top-level
-# README.txt, so that it shows up nicely on PyPI. So please remember to edit
-# it only in one place and sync it correctly.
-long_description = \
-"""
-============
-Fixed lambda
-============
-
-This mini-package contains a module to perform
-a fixed lambda test for the LASSO.
-"""
-
-# versions
-NUMPY_MIN_VERSION='1.7.1'
-SCIPY_MIN_VERSION = '0.9'
-PANDAS_MIN_VERSION = "0.20"
-SKLEARN_MIN_VERSION = '1.0'
-STATSMODELS_MIN_VERSION = '0.13'
-MATPLOTLIB_MIN_VERSION = '3.3.3'
-
-NAME = 'ISLP'
-MAINTAINER = "Jonathan Taylor"
-MAINTAINER_EMAIL = ""
-DESCRIPTION = description
-LONG_DESCRIPTION = long_description
-URL = "http://github.org/jonathan.taylor/ISLP"
-DOWNLOAD_URL = ""
-LICENSE = "BSD license"
-CLASSIFIERS = CLASSIFIERS
-AUTHOR = "ISLP authors"
-AUTHOR_EMAIL = ""
-PLATFORMS = "OS Independent"
-MAJOR = _version_major
-MINOR = _version_minor
-MICRO = _version_micro
-ISRELEASE = _version_extra == ''
-VERSION = __version__
-STATUS = 'alpha'
-PROVIDES = []
-REQUIRES = ["numpy (>=%s)" % NUMPY_MIN_VERSION,
- "scipy (>=%s)" % SCIPY_MIN_VERSION,
- "statsmodels (>=%s)" % STATSMODELS_MIN_VERSION,
- "pandas (>=%s)" % PANDAS_MIN_VERSION,
- "sklearn (>=%s)" % SKLEARN_MIN_VERSION,
- "lifelines",
- "joblib",
- "pygam"
- ]
diff --git a/ISLP/models/__init__.py b/ISLP/models/__init__.py
index bf9cd55..cff02f8 100644
--- a/ISLP/models/__init__.py
+++ b/ISLP/models/__init__.py
@@ -4,14 +4,15 @@
"""
import numpy as np, pandas as pd
+from io import StringIO
from .model_spec import (ModelSpec,
Column,
- Variable,
+ Feature,
poly,
ns,
bs,
- derived_variable,
+ derived_feature,
pca,
contrast,
build_columns)
@@ -24,13 +25,14 @@
sklearn_selection_path)
def summarize(results,
- conf_int=False):
+ conf_int=False,
+ level=None):
"""
Take a fit statsmodels and summarize it
by returning the usual coefficient estimates,
their standard errors, the usual test
statistics and P-values as well as
- (optionally) 95% confidence intervals.
+ (optionally) confidence intervals.
Based on:
@@ -45,8 +47,12 @@ def summarize(results,
Include 95% confidence intervals?
"""
- tab = results.summary().tables[1]
- results_table = pd.read_html(tab.as_html(),
+ if level is not None:
+ conf_int = True
+ if level is None:
+ level = 0.95
+ tab = results.summary(alpha=1-level).tables[1]
+ results_table = pd.read_html(StringIO(tab.as_html()),
index_col=0,
header=0)[0]
if not conf_int:
@@ -57,12 +63,4 @@ def summarize(results,
return results_table[results_table.columns[:-2]]
return results_table
-# def poly(X, degree):
-# """
-# Create columns of design matrix
-# for orthogonal polynomial for a given series X
-# """
-
-# result = Poly(degree=degree).fit_transform(X)
-
diff --git a/ISLP/models/columns.py b/ISLP/models/columns.py
index c15ace2..7ea6adb 100644
--- a/ISLP/models/columns.py
+++ b/ISLP/models/columns.py
@@ -9,7 +9,6 @@
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError
-
class Column(NamedTuple):
"""
@@ -52,7 +51,7 @@ def get_columns(self, X, fit=False):
Column names
"""
- cols = _get_column(self.idx, X, ndarray=False)
+ cols = _get_column(self.idx, X)
if fit:
self.fit_encoder(X)
@@ -88,7 +87,7 @@ def fit_encoder(self, X):
-------
None
"""
- cols = _get_column(self.idx, X, ndarray=False)
+ cols = _get_column(self.idx, X)
if self.encoder is not None:
try:
check_is_fitted(self.encoder)
@@ -102,41 +101,30 @@ def fit_encoder(self, X):
def _get_column(idx,
X,
- twodim=False,
- loc=True,
- ndarray=True):
+ loc=True):
"""
- Extract column `idx` from `X`,
- optionally making it two-dimensional
- as many sklearn encoders assume
- two-dimensional input
+ Extract column `idx` from `X`
+ as a two-dimensional ndarray or a pd.DataFrame
"""
if isinstance(X, np.ndarray):
- col = X[:, idx]
+ col = X[:, [idx]]
elif hasattr(X, 'loc'):
if loc:
- col = X.loc[:, idx]
+ col = X.loc[:, [idx]]
else: # use iloc instead
- col = X.iloc[:, idx]
+ col = X.iloc[:, [idx]]
else:
raise ValueError('expecting an ndarray or a ' +
'"loc/iloc" methods, got %s' % str(X))
- if ndarray:
- if twodim and np.asarray(col).ndim == 1:
- return np.asarray(col).reshape((-1, 1))
- return np.asarray(col)
- else:
- return col
+
+ return col
def _get_column_info(X,
columns,
is_categorical,
is_ordinal,
- default_encoders={
- 'ordinal': OrdinalEncoder(),
- 'categorical': OneHotEncoder()
- }
+ categorical_encoders={}
):
@@ -158,13 +146,19 @@ def _get_column_info(X,
name = str(col)
if is_categorical[i]:
if is_ordinal[i]:
- Xcol = _get_column(col, X, twodim=True)
- encoder = clone(default_encoders['ordinal'])
+ Xcol = _get_column(col, X)
+ if col not in categorical_encoders:
+ encoder = clone(categorical_encoders['ordinal'])
+ else:
+ encoder = categorical_encoders[col]
encoder.fit(Xcol)
columns = ['{0}'.format(col)]
else:
- Xcol = _get_column(col, X, twodim=True, ndarray=True)
- encoder = clone(default_encoders['categorical'])
+ Xcol = _get_column(col, X)
+ if col not in categorical_encoders:
+ encoder = clone(categorical_encoders['categorical'])
+ else:
+ encoder = categorical_encoders[col]
cols = encoder.fit_transform(Xcol)
if hasattr(encoder, 'columns_'):
columns_ = encoder.columns_
@@ -179,7 +173,7 @@ def _get_column_info(X,
tuple(columns),
encoder)
else:
- Xcol = _get_column(col, X, twodim=True)
+ Xcol = _get_column(col, X)
column_info[col] = Column(col,
name,
columns=(name,))
@@ -189,7 +183,6 @@ def _get_column_info(X,
# https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
# max_bins is ignored
-
def _check_categories(categorical_features, X):
"""Check and validate categorical features in X
diff --git a/ISLP/models/generic_selector.py b/ISLP/models/generic_selector.py
index b0261e9..7c9329e 100644
--- a/ISLP/models/generic_selector.py
+++ b/ISLP/models/generic_selector.py
@@ -28,7 +28,10 @@
import scipy as sp
from sklearn.metrics import get_scorer
-from sklearn.base import (clone, MetaEstimatorMixin)
+from sklearn.base import (clone,
+ MetaEstimatorMixin,
+ is_classifier,
+ is_regressor)
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed
@@ -149,13 +152,13 @@ def __init__(self,
self.scoring = scoring
if scoring is None:
- if self.est_._estimator_type == 'classifier':
+ if is_classifier(self.est_):
scoring = 'accuracy'
- elif self.est_._estimator_type == 'regressor':
+ elif is_regressor(self.est_):
scoring = 'r2'
else:
- raise AttributeError('Estimator must '
- 'be a Classifier or Regressor.')
+ scoring = None
+
if isinstance(scoring, str):
self.scorer = get_scorer(scoring)
else:
@@ -486,6 +489,9 @@ def _calc_score(estimator,
pre_dispatch='2*n_jobs',
**fit_params):
+ if scorer is None:
+ scorer = lambda estimator, X, y: estimator.score(X, y)
+
X_state = build_submodel(X, state)
if cv:
diff --git a/ISLP/models/model_spec.py b/ISLP/models/model_spec.py
index c5be3f9..d970bb7 100644
--- a/ISLP/models/model_spec.py
+++ b/ISLP/models/model_spec.py
@@ -35,7 +35,7 @@
DOCACHE = False
-class Variable(NamedTuple):
+class Feature(NamedTuple):
"""
An element in a model matrix that will build
@@ -49,29 +49,64 @@ class Variable(NamedTuple):
pure_columns: bool=False
override_encoder_colnames: bool=False
+
#### contrast specific code
class Contrast(TransformerMixin, BaseEstimator):
- """
- Contrast encoding for categorical variables.
- """
def __init__(self,
method='drop',
drop_level=None):
+ """
+ Contrast encoding for categorical variables.
+
+ Parameters
+ ----------
+ method : ['drop', 'sum', None, callable]
+ If 'drop', then a column of the one-hot
+ encoding will be dropped. If 'sum', then the
+ coefficients are constrained to sum to zero.
+ If `None`, the full one-hot encoding is returned.
+ Finally, if callable, then it should take the number of
+ levels of the category as a single argument and return
+ an appropriate contrast of the full one-hot encoding.
+
+ drop_level : str (optional)
+ If not None, this level of the category
+ will be dropped if `method=='drop'`.
+
+ """
self.method = method
self.drop_level = drop_level
- def fit(self, X):
+ def fit(self, X, y=None):
+
+ """
+ Construct contrast of categorical variable
+ for use in building a design matrix.
+
+ Parameters
+ ----------
+ X : array-like
+ X on which model matrix will be evaluated.
+ If a :py:class:`pd.DataFrame` or :py:class:`pd.Series`, variables that are of
+ categorical dtype will be treated as categorical.
+
+ Returns
+ -------
+ F : array-like
+ Columns of design matrix implied by the
+ categorical variable.
+
+ """
Xa = np.asarray(X).reshape((-1,1))
self.encoder_ = OneHotEncoder(drop=None,
- sparse=False).fit(Xa)
+ sparse_output=False).fit(Xa)
cats = self.encoder_.categories_[0]
column_names = [str(n) for n in cats]
-
if isinstance(X, pd.DataFrame): # expecting a column, we take .iloc[:,0]
X = X.iloc[:,0]
@@ -98,7 +133,7 @@ def fit(self, X):
if self.method == 'drop':
self.columns_ = [column_names[j] for j in colmap]
self.contrast_matrix_ = np.identity(len(cats))
- keep = np.ones(len(cats), np.bool)
+ keep = np.ones(len(cats), bool)
keep[drop_idx] = 0
self.contrast_matrix_ = self.contrast_matrix_[:,keep]
self.contrast_matrix_ = self.contrast_matrix_[:,colmap]
@@ -119,6 +154,7 @@ def fit(self, X):
raise ValueError('method must be one of ["drop", "sum", None] or a callable' +
'that returns a contrast matrix and column names given the number' +
' of levels')
+
return self
def transform(self, X):
@@ -136,22 +172,23 @@ def transform(self, X):
class ModelSpec(TransformerMixin, BaseEstimator):
- '''
-
- Parameters
+ '''Parameters
----------
terms : sequence (optional)
+
Sequence of sets whose
elements are columns of *X* when fit.
For :py:class:`pd.DataFrame` these can be column
names.
intercept : bool (optional)
+
Include a column for intercept?
categorical_features : array-like of {bool, int} of shape (n_features)
or shape (n_categorical_features,), default=None.
+
Indicates the categorical features. Will be ignored if *X* is a :py:class:`pd.DataFrame`
or :py:class:`pd.Series`.
@@ -160,25 +197,31 @@ class ModelSpec(TransformerMixin, BaseEstimator):
- integer array-like : integer indices indicating categorical
features.
- default_encoders : dict
- Dictionary whose keys are elements of *terms* and values
- are transforms to be applied to the associate columns in the model matrix
- by running the *fit_transform* method when *fit* is called and overwriting
- these values in the dictionary.
+ categorical_encoders : dict
+
+ Dictionary whose keys are elements of *terms* that represent
+ **categorical variables**. Its values are transforms to be
+ applied to the associate columns in the model matrix by
+ running the *fit_transform* method when *fit* is called and
+ overwriting these values in the dictionary.
+
'''
def __init__(self,
terms=[],
intercept=True,
categorical_features=None,
- default_encoders={'categorical': Contrast(method='drop'),
- 'ordinal': OrdinalEncoder()}
+ categorical_encoders={}
):
self.intercept = intercept
self.terms = terms
self.categorical_features = categorical_features
- self.default_encoders = default_encoders
+
+ self.categorical_encoders = categorical_encoders
+ self.categorical_encoders_ = {'ordinal': OrdinalEncoder(),
+ 'categorical': Contrast(method='drop')}
+ self.categorical_encoders_.update(**categorical_encoders)
def fit(self, X, y=None):
@@ -203,7 +246,7 @@ def fit(self, X, y=None):
X)
self.columns_ = X.columns
if self.is_categorical_ is None:
- self.is_categorical_ = np.zeros(X.shape[1], np.bool)
+ self.is_categorical_ = np.zeros(X.shape[1], bool)
self.is_ordinal_ = pd.Series(self.is_ordinal_,
index=self.columns_)
self.is_categorical_ = pd.Series(self.is_categorical_,
@@ -214,32 +257,33 @@ def fit(self, X, y=None):
self.known_categories_) = _check_categories(categorical_features,
X)
if self.is_categorical_ is None:
- self.is_categorical_ = np.zeros(X.shape[1], np.bool)
+ self.is_categorical_ = np.zeros(X.shape[1], bool)
self.is_ordinal_ = np.zeros(self.is_categorical_.shape,
- np.bool)
+ bool)
self.columns_ = np.arange(X.shape[1])
- self.variables_ = {}
+ self.features_ = {}
self.encoders_ = {}
self.column_info_ = _get_column_info(X,
self.columns_,
- self.is_categorical_,
- self.is_ordinal_,
- default_encoders=self.default_encoders)
- # include each column as a Variable
+ np.asarray(self.is_categorical_),
+ np.asarray(self.is_ordinal_),
+ categorical_encoders=self.categorical_encoders_)
+
+ # include each column as a Feature
# so that their columns are built if needed
for col_ in self.columns_:
- self.variables_[col_] = Variable((col_,), str(col_), None, pure_columns=True)
+ self.features_[col_] = Feature((col_,), str(col_), None, pure_columns=True)
- # find possible interactions and other variables
+ # find possible interactions and other features
tmp_cache = {}
for term in self.terms:
- if isinstance(term, Variable):
- self.variables_[term] = term
+ if isinstance(term, Feature):
+ self.features_[term] = term
build_columns(self.column_info_,
X,
term,
@@ -247,18 +291,18 @@ def fit(self, X, y=None):
col_cache=tmp_cache,
fit=True) # these encoders won't have been fit yet
for var in term.variables:
- if var not in self.variables_ and isinstance(var, Variable):
- self.variables_[var] = var
+ if var not in self.features_ and isinstance(var, Feature):
+ self.features_[var] = var
elif term not in self.column_info_:
- # a tuple of variables represents an interaction
+ # a tuple of features represents an interaction
if type(term) == type((1,)):
names = []
column_map = {}
column_names = {}
idx = 0
for var in term:
- if var in self.variables_:
- var = self.variables_[var]
+ if var in self.features_:
+ var = self.features_[var]
cols, cur_names = build_columns(self.column_info_,
X,
var,
@@ -270,17 +314,17 @@ def fit(self, X, y=None):
idx += cols.shape[1]
names.append(var.name)
encoder_ = Interaction(names, column_map, column_names)
- self.variables_[term] = Variable(term, ':'.join(n for n in names), encoder_)
+ self.features_[term] = Feature(term, ':'.join(n for n in names), encoder_)
elif isinstance(term, Column):
- self.variables_[term] = Variable((term,), term.name, None, pure_columns=True)
+ self.features_[term] = Feature((term,), term.name, None, pure_columns=True)
else:
- raise ValueError('each element in a term should be a Variable, Column or identify a column')
+ raise ValueError('each element in a term should be a Feature, Column or identify a column')
# build the mapping of terms to columns and column names
self.column_names_ = {}
self.column_map_ = {}
- self.terms_ = [self.variables_[t] for t in self.terms]
+ self.terms_ = [self.features_[t] for t in self.terms]
idx = 0
if self.intercept:
@@ -310,64 +354,48 @@ def transform(self, X, y=None):
Ignored. This parameter exists only for compatibility with
:py:class:`sklearn.pipeline.Pipeline`.
"""
- return self.build_submodel(X, self.terms_)
+ check_is_fitted(self)
+ return build_model(self.column_info_,
+ X,
+ self.terms_,
+ intercept=self.intercept,
+ encoders=self.encoders_)
# ModelSpec specific methods
- def build_submodel(self, X, terms):
+ @property
+ def names(self, help='Name for each term in model specification.'):
+ names = []
+ if self.intercept:
+ names = ['intercept']
+ return names + [t.name for t in self.terms_]
+
+ def build_submodel(self,
+ X,
+ terms):
"""
- Construct design matrix on a
- sequence of terms and X after
- fitting.
+ Build design on X after fitting.
Parameters
----------
X : array-like
- X on which model matrix will be evaluated.
+ X on which columns are evaluated.
+
+ terms : [Feature]
+ Sequence of features
Returns
-------
- df : np.ndarray or pd.DataFrame
- Design matrix.
+ D : array-like
+ Design matrix created with `terms`
"""
- check_is_fitted(self)
-
- dfs = []
-
- col_cache = {} # avoid recomputing the same columns
-
- if self.intercept:
- df = pd.DataFrame({'intercept':np.ones(X.shape[0])})
- if isinstance(X, (pd.Series, pd.DataFrame)):
- df.index = X.index
- dfs.append(df)
-
- for term_ in terms:
- term_df = build_columns(self.column_info_,
- X,
- term_,
- col_cache=col_cache,
- encoders=self.encoders_,
- fit=False)[0]
- dfs.append(term_df)
-
- if len(dfs):
- if isinstance(X, (pd.Series, pd.DataFrame)):
- df = pd.concat(dfs, axis=1)
- df.index = X.index
- return df
- else:
- return np.column_stack(dfs)
- else: # return a 0 design
- zero = np.zeros(X.shape[0])
- if isinstance(X, (pd.Series, pd.DataFrame)):
- df = pd.DataFrame({'zero': zero})
- df.index = X.index
- return df
- else:
- return zero
+ return build_model(self.column_info_,
+ X,
+ terms,
+ intercept=self.intercept,
+ encoders=self.encoders_)
def build_sequence(self,
X,
@@ -375,6 +403,21 @@ def build_sequence(self,
"""
Build implied sequence of submodels
based on successively including more terms.
+
+ Parameters
+ ----------
+ X : array-like
+ X on which columns are evaluated.
+
+ anova_type: str
+ One of "sequential" or "drop".
+
+ Returns
+ -------
+
+ models : generator
+ Generator for sequence of models for ANOVA.
+
"""
check_is_fitted(self)
@@ -427,8 +470,11 @@ def fit_encoder(encoders, var, X):
Parameters
----------
- var : Variable
- Variable whose encoder will be fit.
+ encoders : dict
+ Dictionary of encoders for each feature.
+
+ var : Feature
+ Feature whose encoder will be fit.
X : array-like
X on which encoder will be fit.
@@ -440,7 +486,7 @@ def fit_encoder(encoders, var, X):
def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False):
"""
- Build columns for a Variable from X.
+ Build columns for a Feature from X.
Parameters
----------
@@ -452,10 +498,13 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False):
X : array-like
X on which columns are evaluated.
- var : Variable
- Variable whose columns will be built, typically a key in `column_info`.
+ var : Feature
+ Feature whose columns will be built, typically a key in `column_info`.
- col_cache:
+ encoders : dict
+ Dict that stores encoder of each Feature.
+
+ col_cache: dict
Dict where columns will be stored --
if `var.name` in `col_cache` then just
returns those columns.
@@ -480,7 +529,7 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False):
cols, name = col_cache[joblib_hash([var, X])]
else:
cols, names = var.get_columns(X, fit=fit)
- elif isinstance(var, Variable):
+ elif isinstance(var, Feature):
cols = []
names = []
for v in var.variables:
@@ -495,16 +544,18 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False):
cols = np.column_stack(cols)
if len(names) != cols.shape[1]:
names = ['{0}[{1}]'.format(var.name, j) for j in range(cols.shape[1])]
-
if var.encoder:
+ df_cols = pd.DataFrame(np.asarray(cols),
+ columns=names)
try:
check_is_fitted(var.encoder)
if fit and var not in encoders:
raise ValueError('encoder has already been fit previously')
except NotFittedError as e:
if fit:
- fit_encoder(var, pd.DataFrame(np.asarray(cols),
- columns=names))
+ fit_encoder(encoders,
+ var,
+ df_cols)
# known issue with Pipeline
# https://github.com/scikit-learn/scikit-learn/issues/18648
elif isinstance(var.encoder, Pipeline):
@@ -514,9 +565,9 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False):
except Exception as e: # was not the NotFitted
raise ValueError(e)
if var.use_transform:
- cols = var.encoder.transform(cols)
+ cols = var.encoder.transform(df_cols)
else:
- cols = var.encoder.predict(cols)
+ cols = var.encoder.predict(df_cols)
if hasattr(var.encoder, 'columns_') and not var.override_encoder_colnames:
names = var.encoder.columns_
else:
@@ -527,7 +578,7 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False):
else:
- raise ValueError('expecting either a column or a Variable')
+ raise ValueError('expecting either a column or a Feature')
val = pd.DataFrame(np.asarray(cols), columns=names)
if isinstance(X, (pd.DataFrame, pd.Series)):
@@ -537,16 +588,88 @@ def build_columns(column_info, X, var, encoders={}, col_cache={}, fit=False):
col_cache[joblib_hash([var.name, X])] = (val, names)
return val, names
+def build_model(column_info,
+ X,
+ terms,
+ intercept=True,
+ encoders={}):
+
+ """
+ Construct design matrix on a
+ sequence of terms and X after
+ fitting.
+
+ Parameters
+ ----------
+ column_info: dict
+ Dictionary with values specifying sets of columns to
+ be concatenated into a design matrix.
+
+ X : array-like
+ X on which columns are evaluated.
+
+ terms : [Feature]
+ Sequence of features
+
+ encoders : dict
+ Dict that stores encoder of each Feature.
-def derived_variable(variables, encoder=None, name=None, use_transform=True):
+ Returns
+ -------
+ df : np.ndarray or pd.DataFrame
+ Design matrix.
"""
- Create a Variable, optionally
+
+ dfs = []
+
+ col_cache = {} # avoid recomputing the same columns
+
+ if intercept:
+ df = pd.DataFrame({'intercept':np.ones(X.shape[0])})
+ if isinstance(X, (pd.Series, pd.DataFrame)):
+ df.index = X.index
+ dfs.append(df)
+
+ for term_ in terms:
+ term_df = build_columns(column_info,
+ X,
+ term_,
+ col_cache=col_cache,
+ encoders=encoders,
+ fit=False)[0]
+ dfs.append(term_df)
+
+ if len(dfs):
+ if isinstance(X, (pd.Series, pd.DataFrame)):
+ df = pd.concat(dfs, axis='columns')
+ df.index = X.index
+ else:
+ return np.column_stack(dfs).astype(float)
+ else: # return a 0 design
+ zero = np.zeros(X.shape[0])
+ if isinstance(X, (pd.Series, pd.DataFrame)):
+ df = pd.DataFrame({'zero': zero})
+ df.index = X.index
+ else:
+ return zero
+
+ # if we reach here, we will be returning a DataFrame
+ # make sure all columns are floats
+
+ for i, col in enumerate(df.columns):
+ if df.iloc[:,i].dtype == bool:
+ df[col] = df.iloc[:,i].astype(float)
+ return df
+
+def derived_feature(variables, encoder=None, name=None, use_transform=True):
+ """
+ Create a Feature, optionally
applying an encoder to the stacked columns.
Parameters
----------
- variables : [column identifier, Column, Variable]
+ variables : [column identifier, Column, Feature]
Variables to apply transform to. Could be
column identifiers or variables: all columns
will be stacked before encoding.
@@ -560,12 +683,12 @@ def derived_variable(variables, encoder=None, name=None, use_transform=True):
Returns
-------
- var : Variable
+ var : Feature
"""
if name is None:
name = str(encoder)
- var = Variable(tuple([v for v in variables]),
+ var = Feature(tuple([v for v in variables]),
name,
encoder,
use_transform=use_transform,
@@ -590,7 +713,7 @@ def contrast(col,
Returns
-------
- var : Variable
+ var : Feature
"""
@@ -606,7 +729,7 @@ def contrast(col,
is_categorical=True,
encoder=encoder)
-def ordinal(col, *args, **kwargs):
+def ordinal(col, name=None, *args, **kwargs):
"""
Create ordinal encoding of categorical feature.
@@ -618,7 +741,7 @@ def ordinal(col, *args, **kwargs):
Returns
-------
- var : Variable
+ var : Feature
"""
@@ -637,7 +760,7 @@ def ordinal(col, *args, **kwargs):
name = f'{shortname}({name})'
- return derived_variable([col],
+ return derived_feature([col],
name=name,
encoder=encoder)
@@ -648,7 +771,7 @@ def poly(col,
name=None):
"""
- Create a polynomial Variable
+ Create a polynomial Feature
for a given column.
Additional `args` and `kwargs`
@@ -676,7 +799,7 @@ def poly(col,
Returns
-------
- var : Variable
+ var : Feature
"""
shortname, klass = 'poly', Poly
encoder = klass(degree=degree,
@@ -701,13 +824,13 @@ def poly(col,
name = f'{shortname}({name})'
- return derived_variable([col],
+ return derived_feature([col],
name=name,
encoder=encoder)
def ns(col, intercept=False, name=None, **spline_args):
"""
- Create a natural spline Variable
+ Create a natural spline Feature
for a given column.
Additional *spline_args*
@@ -727,7 +850,7 @@ def ns(col, intercept=False, name=None, **spline_args):
Returns
-------
- var : Variable
+ var : Feature
"""
shortname, klass = 'ns', NaturalSpline
@@ -744,13 +867,13 @@ def ns(col, intercept=False, name=None, **spline_args):
name = f'{shortname}({name})'
encoder = klass(intercept=intercept,
**spline_args)
- return derived_variable([col],
+ return derived_feature([col],
name=name,
encoder=encoder)
def bs(col, intercept=False, name=None, **spline_args):
"""
- Create a B-spline Variable
+ Create a B-spline Feature
for a given column.
Additional args and *spline_args*
@@ -771,7 +894,7 @@ def bs(col, intercept=False, name=None, **spline_args):
Returns
-------
- var : Variable
+ var : Feature
"""
shortname, klass = 'bs', BSpline
@@ -788,7 +911,7 @@ def bs(col, intercept=False, name=None, **spline_args):
name = f'{shortname}({name})'
encoder = klass(intercept=intercept,
**spline_args)
- return derived_variable([col],
+ return derived_feature([col],
name=name,
encoder=encoder)
@@ -803,13 +926,13 @@ def pca(variables, name, scale=False, **pca_args):
Parameters
----------
- variables : [column identifier, Column or Variable]
+ variables : [column identifier, Column or Feature]
Sequence whose columns will be encoded by PCA.
Returns
-------
- var : Variable
+ var : Feature
"""
shortname, klass = 'pca', PCA
@@ -824,52 +947,10 @@ def pca(variables, name, scale=False, **pca_args):
if _args:
name = ', '.join([name, _args])
- return derived_variable(variables,
+ return derived_feature(variables,
name=f'{shortname}({name})',
encoder=encoder)
-# def clusterer(variables, name, transform, scale=False):
-# """
-# Create PCA encoding of features
-# from a sequence of variables.
-
-# Additional `args` and `kwargs`
-# are passed to `PCA`.
-
-# Parameters
-# ----------
-
-# variables : [column identifier, Column or Variable]
-# Sequence whose columns will be encoded by PCA.
-
-# name: str
-# name for the Variable
-
-# transform: Transformer
-# A transform with a `predict` method.
-
-# Returns
-# -------
-
-# var : Variable
-
-# """
-
-# if scale:
-# scaler = StandardScaler(with_mean=True,
-# with_std=True)
-# encoder = make_pipeline(scaler, transform)
-# else:
-# encoder = transform
-
-# intermed = Variable((derived_variable(*variables,
-# name='cluster_intermed',
-# encoder=encoder,
-# use_transform=False),),
-# name=f'Cat({encoder}({name}))',
-# encoder=Contrast(method='drop'))
-
-# return intermed
def _argstring(*args, **kwargs):
_args = ', '.join([str(a) for a in args])
diff --git a/ISLP/models/sklearn_wrap.py b/ISLP/models/sklearn_wrap.py
index 123130b..121da75 100644
--- a/ISLP/models/sklearn_wrap.py
+++ b/ISLP/models/sklearn_wrap.py
@@ -49,7 +49,17 @@ def __init__(self,
self.model_type = model_type
self.model_spec = model_spec
self.model_args = model_args
-
+
+ def __sklearn_tags__(self):
+ tags = super().__sklearn_tags__()
+ if self.model_type == sm.OLS:
+ tags.estimator_type = 'regressor'
+ elif (issubclass(self.model_type, sm.GLM) and
+ 'family' in self.model_args and
+ isinstance(self.model_args.get('family', None), sm.families.Binomial)):
+ tags.estimator_type = 'classifier'
+ return tags
+
def fit(self, X, y):
"""
Fit a statsmodel model
@@ -171,6 +181,9 @@ def __init__(self,
self.cv = cv
self.scoring = scoring
+ def __sklearn_tags__(self):
+ tags = super().__sklearn_tags__()
+ return tags
def fit(self, X, y):
"""
diff --git a/ISLP/models/strategy.py b/ISLP/models/strategy.py
index 028ac94..f237db3 100644
--- a/ISLP/models/strategy.py
+++ b/ISLP/models/strategy.py
@@ -74,9 +74,9 @@ def __init__(self,
Minumum number of terms to select
max_terms: int (default: 0)
Maximum number of terms to select
- lower_terms: [Variable]
+ lower_terms: [Feature]
Subset of terms to keep: smallest model.
- upper_terms: [Variable]
+ upper_terms: [Feature]
Largest possible model.
validator: callable
Callable taking a single argument: state,
@@ -216,9 +216,9 @@ class Stepwise(MinMaxCandidates):
Minumum number of terms to select
max_terms: int (default: 1)
Maximum number of terms to select
- lower_terms: [Variable]
+ lower_terms: [Feature]
Subset of terms to keep: smallest model.
- upper_terms: [Variable]
+ upper_terms: [Feature]
Largest possible model.
constraints: {array-like} (optional), shape [n_terms, n_terms]
Boolean matrix decribing a dag with [i,j] nonzero implying that j is
@@ -342,9 +342,9 @@ def first_peak(model_spec,
Minumum number of terms to select
max_terms: int (default: 1)
Maximum number of terms to select
- lower_terms: [Variable]
+ lower_terms: [Feature]
Subset of terms to keep: smallest model.
- upper_terms: [Variable]
+ upper_terms: [Feature]
Largest possible model.
initial_terms: column identifiers, default=[]
Subset of terms to be used to initialize when direction
@@ -441,9 +441,9 @@ def fixed_steps(model_spec,
max_terms: int (default: None)
Maximum number of terms to select.
If None defaults to number of terms in *model_spec*.
- lower_terms: [Variable]
+ lower_terms: [Feature]
Subset of terms to keep: smallest model.
- upper_terms: [Variable]
+ upper_terms: [Feature]
Largest possible model.
initial_terms: column identifiers, default=[]
Subset of terms to be used to initialize.
@@ -506,9 +506,9 @@ def min_max(model_spec,
Minumum number of terms to select
max_terms: int (default: 1)
Maximum number of terms to select
- lower_terms: [Variable]
+ lower_terms: [Feature]
Subset of terms to keep: smallest model.
- upper_terms: [Variable]
+ upper_terms: [Feature]
Largest possible model.
validator: callable
Callable taking a single argument: state,
diff --git a/ISLP/survival.py b/ISLP/survival.py
index b11967b..c352942 100644
--- a/ISLP/survival.py
+++ b/ISLP/survival.py
@@ -14,7 +14,7 @@
def sim_time(linpred,
cum_hazard,
- rng):
+ rng=None):
"""
Simulate a survival time for a
cumulative hazard function $H$ with cumulative hazard
@@ -39,6 +39,9 @@ def sim_time(linpred,
Used to generate survival times.
"""
+ if rng is None:
+ rng = np.random.default_rng()
+
U = rng.uniform()
B = - np.log(U) / np.exp(linpred)
lower, upper = 1, 2
diff --git a/ISLP/svm.py b/ISLP/svm.py
index bedf288..8afcd5a 100644
--- a/ISLP/svm.py
+++ b/ISLP/svm.py
@@ -28,6 +28,12 @@ def plot(X,
'''
Graphical representation of fitted support vector classifier.
+ There are two types of support vectors:
+
+ - Points violating the margin but correctly classified. These are marked with a black '+'.
+
+ - Misclassified points. These are marked with a red 'x'.
+
Parameters
----------
@@ -89,7 +95,7 @@ def plot(X,
# draw the points
- ax.scatter(X0, X1, c=Y, cmap=scatter_cmap)
+ ax.scatter(X0, X1, c=Y, cmap=scatter_cmap, s=200)
# add the contour
@@ -113,8 +119,27 @@ def plot(X,
cmap=decision_cmap,
alpha=alpha)
- # add the support vectors
+ decision_val = svm.decision_function(X_pred)
- ax.scatter(X[svm.support_,features[0]],
- X[svm.support_,features[1]], marker='+', c='k', s=200)
+ # add the support vectors
+ if svm.classes_.shape[0] == 2: # 2-class problem
+
+ ax.contourf(xval,
+ yval,
+ decision_val.reshape(yval.shape),
+ levels=[-1,1],
+ cmap=decision_cmap,
+ alpha=alpha)
+
+ D = svm.decision_function(X[svm.support_])
+ Y_ = (2 * (Y[svm.support_] == svm.classes_[1]) - 1)
+ violate_margin = (Y_ * D) > 0
+ ax.scatter(X[svm.support_,features[0]][violate_margin],
+ X[svm.support_,features[1]][violate_margin], marker='+', c='k', s=50)
+ misclassified = ~violate_margin
+ ax.scatter(X[svm.support_,features[0]][misclassified],
+ X[svm.support_,features[1]][misclassified], marker='x', c='r', s=50)
+ else:
+ ax.scatter(X[svm.support_,features[0]],
+ X[svm.support_,features[1]], marker='+', c='k', s=50)
diff --git a/ISLP/torch/imdb.py b/ISLP/torch/imdb.py
index 617489d..3dfacfe 100644
--- a/ISLP/torch/imdb.py
+++ b/ISLP/torch/imdb.py
@@ -12,7 +12,6 @@
import torch
from torch.utils.data import TensorDataset
from scipy.sparse import load_npz
-from pkg_resources import resource_filename
from pickle import load as load_pickle
import urllib
diff --git a/ISLP/torch/lightning.py b/ISLP/torch/lightning.py
index 82c45db..d7056ec 100644
--- a/ISLP/torch/lightning.py
+++ b/ISLP/torch/lightning.py
@@ -7,14 +7,14 @@
DataLoader,
Dataset)
from torch import tensor, Generator, concat
-from torchvision import transforms
+
from torch.utils.data import TensorDataset
from torchmetrics import Accuracy
from pytorch_lightning import (LightningModule,
LightningDataModule)
-from pytorch_lightning.utilities.distributed import rank_zero_only
+from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.callbacks import Callback
class SimpleDataModule(LightningDataModule):
@@ -132,14 +132,15 @@ def __init__(self,
model,
loss,
optimizer=None,
- metrics={},
+ metrics=None,
on_epoch=True,
pre_process_y_for_metrics=lambda y: y):
super(SimpleModule, self).__init__()
self.model = model
- self.loss = loss or nn.MSELoss()
+ self.loss = loss
+
optimizer = optimizer or RMSprop(model.parameters())
self._optimizer = optimizer
self.metrics = metrics
@@ -160,8 +161,10 @@ def training_step(self, batch, batch_idx):
y_ = self.pre_process_y_for_metrics(y)
for _metric in self.metrics.keys():
+ pl_metric = self.metrics[_metric]
self.log(f"train_{_metric}",
- self.metrics[_metric](preds, y_),
+ pl_metric(preds.to(pl_metric.device),
+ y_.to(pl_metric.device)),
on_epoch=self.on_epoch)
return loss
@@ -181,22 +184,36 @@ def configure_optimizers(self):
@staticmethod
def regression(model,
+ metrics=None,
+ device='cpu',
**kwargs):
- loss = nn.MSELoss()
+
+ if metrics is None:
+ metrics = {}
+
+ loss = nn.MSELoss().to(device)
+ if device is not None:
+ for key, metric in metrics.items():
+ metrics[key] = metric.to(device)
return SimpleModule(model,
loss,
+ metrics=metrics,
**kwargs)
@staticmethod
def binary_classification(model,
- metrics={},
- device=None,
+ metrics=None,
+ device='cpu',
**kwargs):
+
+ if metrics is None:
+ metrics = {}
+
loss = nn.BCEWithLogitsLoss()
if 'accuracy' not in metrics:
- metrics['accuracy'] = Accuracy()
+ metrics['accuracy'] = Accuracy('binary')
if device is not None:
- for key, metric in metrics:
+ for key, metric in metrics.items():
metrics[key] = metric.to(device)
return SimpleModule(model,
loss,
@@ -206,14 +223,20 @@ def binary_classification(model,
@staticmethod
def classification(model,
- metrics={},
- device=None,
+ num_classes,
+ metrics=None,
+ device='cpu',
**kwargs):
- loss = nn.CrossEntropyLoss()
+
+ if metrics is None:
+ metrics = {}
+
+ loss = nn.CrossEntropyLoss().to(device)
if 'accuracy' not in metrics:
- metrics['accuracy'] = Accuracy()
+ metrics['accuracy'] = Accuracy('multiclass',
+ num_classes=num_classes)
if device is not None:
- for key, metric in metrics:
+ for key, metric in metrics.items():
metrics[key] = metric.to(device)
return SimpleModule(model,
loss,
@@ -233,7 +256,7 @@ def on_validation_batch_start(self,
pl_module,
batch,
batch_idx,
- dataloader_idx):
+ dataloader_idx=0):
x, y = batch
self.val_preds.append(pl_module.forward(x))
self.val_targets.append(y)
@@ -252,8 +275,10 @@ def on_validation_epoch_end(self,
on_epoch=pl_module.on_epoch)
for _metric in pl_module.metrics.keys():
+ pl_metric = pl_module.metrics[_metric]
pl_module.log(f"valid_{_metric}",
- pl_module.metrics[_metric](preds, targets_),
+ pl_metric(preds.to(pl_metric.device),
+ targets_.to(pl_metric.device)),
on_epoch=pl_module.on_epoch)
def on_test_epoch_start(self,
@@ -267,7 +292,7 @@ def on_test_batch_start(self,
pl_module,
batch,
batch_idx,
- dataloader_idx):
+ dataloader_idx=0):
x, y = batch
self.test_preds.append(pl_module.forward(x))
self.test_targets.append(y)
@@ -286,7 +311,9 @@ def on_test_epoch_end(self,
on_epoch=pl_module.on_epoch)
for _metric in pl_module.metrics.keys():
+ pl_metric = pl_module.metrics[_metric]
pl_module.log(f"test_{_metric}",
- pl_module.metrics[_metric](preds, targets_),
+ pl_metric(preds.to(pl_metric.device),
+ targets_.to(pl_metric.device)),
on_epoch=pl_module.on_epoch)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..dd8ced0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ (1) Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ (2) Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ (3) The name of the author may not be used to
+ endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/README.md b/README.md
index eb283fa..546ddba 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,99 @@
# ISLP
+
+[](#contributors-)
+
This package collects data sets and various helper functions
for ISLP.
## Install instructions
-### Mac OS X
+### Mac OS X / Linux
+
+We generally recommend creating a [conda](https://anaconda.org) environment to isolate any code
+from other dependencies. The `ISLP` package does not have unusual dependencies, but this is still
+good practice. To create a conda environment in a Mac OS X or Linux environment run:
```{python}
-pip install ISLP
+conda create --name islp
+```
+
+To run python code in this environment, you must activate it:
+
+```{python}
+conda activate islp
```
### Windows
-See the [https://packaging.python.org/en/latest/tutorials/installing-packages/#ensure-you-can-run-pip-from-the-command-line](python-packaging-instructions) for a simple way to run `pip` within
-Jupyter.
+On Windows, create a `Python` environment called `islp` in the Anaconda app. This can be done by selecting `Environments` on the left hand side of the app's screen. After creating the environment, open a terminal within that environment by clicking on the "Play" button.
-Alternatively, within a python shell, the following commands should install `ISLP`:
+
+## Installing `ISLP`
+
+Having completed the steps above, we use `pip` to install the `ISLP` package:
```{python}
-import os, sys
-cmd = f'{sys.executable} -m pip install ISLP'
-os.system(cmd)
+pip install ISLP
```
### Torch requirements
The `ISLP` labs use `torch` and various related packages for the lab on deep learning. The requirements
-can be found [here](requirements.txt). Alternatively, you can install them directly using `pip`
+are included in the requirements for `ISLP` with the exception of those needed
+for the labs which are included in the [requirements for the labs](https://github.com/intro-stat-learning/ISLP_labs/blob/main/requirements.txt).
+
+## Jupyter
+
+### Mac OS X
+
+If JupyterLab is not already installed, run the following after having activated your `islp` environment:
```{python}
-reqs = 'https://raw.githubusercontent.com/jonathan-taylor/ISLP/master/requirements.txt'
-cmd = f'{sys.executable} -m pip install -r {reqs}'
-os.system(cmd)
+pip install jupyterlab
```
+### Windows
+
+Either use the same `pip` command above or install JupyterLab from the `Home` tab. Ensure that the environment
+is your `islp` environment. This information appears near the top left in the Anaconda `Home` page.
+
+
## Documentation
-See the [read the docs](https://islp.readthedocs.io/en/latest/models.html)
+See the [docs](https://intro-stat-learning.github.io/ISLP/labs.html) for the latest documentation.
+
+## Authors
+
+- Jonathan Taylor
+- Trevor Hastie
+- Gareth James
+- Robert Tibshirani
+- Daniela Witten
+
+
+
+
+## Contributors ✨
+
+Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
+
+
+
+
+
+
+
+
+This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
\ No newline at end of file
diff --git a/docs/ISLP_labs b/docs/ISLP_labs
new file mode 160000
index 0000000..5d793a3
--- /dev/null
+++ b/docs/ISLP_labs
@@ -0,0 +1 @@
+Subproject commit 5d793a33a8d5025181439b8d0f193c37c69ee20a
diff --git a/docs/README.rst b/docs/README.rst
new file mode 100644
index 0000000..41df584
--- /dev/null
+++ b/docs/README.rst
@@ -0,0 +1,15 @@
+Deep learning
+=============
+
+This lab should be run as a notebook and saved
+
+Ridge regression
+================
+
+There is a snippet that should be inserted to remove the many warnings raised.
+
+Frozen reqs
+===========
+
+The versions of the labs are referred to in `source/installation.myst`, `source/labs.rst`. Version built
+on `readthedocs` is referenced in `fix_and_run_notebooks.py`
diff --git a/docs/fix_and_clear_notebooks.py b/docs/fix_and_clear_notebooks.py
new file mode 100644
index 0000000..50eebe2
--- /dev/null
+++ b/docs/fix_and_clear_notebooks.py
@@ -0,0 +1,127 @@
+
+from dataclasses import dataclass
+from copy import copy
+
+import shlex
+import subprocess
+import os
+import sys
+import json
+import nbformat
+from argparse import ArgumentParser
+
+def get_version():
+ import __main__
+ dirname = os.path.split(__main__.__file__)[0]
+ sys.path.append(os.path.join(dirname, 'source'))
+ from conf import docs_version
+ sys.path = sys.path[:-1]
+ return docs_version
+
+
+@dataclass
+class Lab(object):
+
+ labfile: str
+ version: str = 'v2'
+ rm_md: bool = True
+
+ def __post_init__(self):
+ self.labfile = os.path.abspath(self.labfile)
+
+ def fix_header(self):
+ labname = os.path.split(self.labfile)[1]
+ base = os.path.splitext(self.labfile)[0]
+ args = shlex.split(f'jupytext --set-formats ipynb,md:myst {self.labfile}')
+ subprocess.run(args)
+
+ # successful run of jupytext
+ myst = open(f'{base}.md').read().strip()
+ split_myst = myst.split('\n')
+ new_myst = []
+
+ colab_code = f'''
+
+
+
+
+[](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/{self.version}?labpath={labname})
+
+'''
+
+ chapter_buffer = 200 # should use a regex...
+    for l in split_myst[:chapter_buffer]: # assumes Chapter appears in first 200 lines; should use a regex
+ if l.strip()[:9] != '# Chapter': # exclude the line with "# Chapter"
+ if 'Lab:' in l:
+ l = l.replace('Lab:', '') + '\n' + colab_code
+ new_myst.append(l)
+
+ myst = '\n'.join(new_myst + split_myst[chapter_buffer:])
+
+ open(f'{base}.md', 'w').write(myst)
+
+ args = shlex.split(f'jupytext --sync {base}.ipynb')
+ subprocess.run(args)
+
+ args = shlex.split(f'jupytext --set-formats Rmd,ipynb {base}.ipynb')
+ subprocess.run(args)
+
+ args = shlex.split(f'jupytext --sync {base}.ipynb')
+ subprocess.run(args)
+
+ if self.rm_md:
+ subprocess.run(['rm', f'{base}.md'])
+
+def fix_Ch06(Ch06_nbfile):
+
+ nb = nbformat.read(open(Ch06_nbfile), 4)
+
+ md_cell = copy(nb.cells[0])
+ md_cell['id'] = md_cell['id'] + '_duplicate'
+
+ src = '''
+
+```{attention}
+Using `skl.ElasticNet` to fit ridge regression
+throws up many warnings. We have suppressed them below by a call to `warnings.simplefilter()`.
+```
+
+'''
+
+ md_cell['source'] = [l +'\n' for l in src.split('\n')]
+
+ for i, cell in enumerate(nb.cells):
+ if cell['cell_type'] == 'code':
+ code_cell = copy(cell)
+ code_cell['id'] = code_cell['id'] + '_duplicate'
+ code_cell['source'] = ['import warnings\n', 'warnings.simplefilter("ignore")\n']
+ break
+
+ nb.cells.insert(i, md_cell)
+ nb.cells.insert(i+1, code_cell)
+
+ nbformat.write(nb, open(Ch06_nbfile, 'w'))
+ subprocess.run(shlex.split(f'jupytext --sync {Ch06_nbfile}'))
+
+if __name__ == "__main__":
+
+ docs_version = get_version()
+
+ parser = ArgumentParser()
+ parser.add_argument('labs',
+ metavar='N',
+ type=str,
+ nargs='+')
+ parser.add_argument('--rm_md',
+ dest='rm_md',
+ action='store_true',
+ default=False)
+
+ args = parser.parse_args()
+
+ for labfile in args.labs:
+ l = Lab(labfile=labfile, version=docs_version['labs'])
+ l.fix_header()
+ if '06' in labfile:
+ fix_Ch06(labfile)
+
diff --git a/docs/jupyterbook/datasets/Auto.ipynb b/docs/jupyterbook/datasets/Auto.ipynb
index f84fbfc..b88ea02 100644
--- a/docs/jupyterbook/datasets/Auto.ipynb
+++ b/docs/jupyterbook/datasets/Auto.ipynb
@@ -88,9 +88,9 @@
"formats": "source/datasets///ipynb,jupyterbook/datasets///md:myst,jupyterbook/datasets///ipynb"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Auto.md b/docs/jupyterbook/datasets/Auto.md
index fe851ed..627d70b 100644
--- a/docs/jupyterbook/datasets/Auto.md
+++ b/docs/jupyterbook/datasets/Auto.md
@@ -5,11 +5,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Auto Data
diff --git a/docs/jupyterbook/datasets/Bikeshare.ipynb b/docs/jupyterbook/datasets/Bikeshare.ipynb
index b0edebc..ddb1053 100644
--- a/docs/jupyterbook/datasets/Bikeshare.ipynb
+++ b/docs/jupyterbook/datasets/Bikeshare.ipynb
@@ -102,9 +102,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Bikeshare.md b/docs/jupyterbook/datasets/Bikeshare.md
index 90e1f7f..380bc1b 100644
--- a/docs/jupyterbook/datasets/Bikeshare.md
+++ b/docs/jupyterbook/datasets/Bikeshare.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Bike sharing data
diff --git a/docs/jupyterbook/datasets/Boston.ipynb b/docs/jupyterbook/datasets/Boston.ipynb
index 1b5dce0..569f5b4 100644
--- a/docs/jupyterbook/datasets/Boston.ipynb
+++ b/docs/jupyterbook/datasets/Boston.ipynb
@@ -95,9 +95,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Boston.md b/docs/jupyterbook/datasets/Boston.md
index 60b6f5e..1146a86 100644
--- a/docs/jupyterbook/datasets/Boston.md
+++ b/docs/jupyterbook/datasets/Boston.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Boston Data
diff --git a/docs/jupyterbook/datasets/BrainCancer.ipynb b/docs/jupyterbook/datasets/BrainCancer.ipynb
index fd8e84e..cb75946 100644
--- a/docs/jupyterbook/datasets/BrainCancer.ipynb
+++ b/docs/jupyterbook/datasets/BrainCancer.ipynb
@@ -95,9 +95,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/BrainCancer.md b/docs/jupyterbook/datasets/BrainCancer.md
index 3e1a2be..7307a69 100644
--- a/docs/jupyterbook/datasets/BrainCancer.md
+++ b/docs/jupyterbook/datasets/BrainCancer.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Brain Cancer Data
diff --git a/docs/jupyterbook/datasets/Caravan.ipynb b/docs/jupyterbook/datasets/Caravan.ipynb
index ad1af58..f093422 100644
--- a/docs/jupyterbook/datasets/Caravan.ipynb
+++ b/docs/jupyterbook/datasets/Caravan.ipynb
@@ -63,9 +63,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Caravan.md b/docs/jupyterbook/datasets/Caravan.md
index a42ddb1..24f8335 100644
--- a/docs/jupyterbook/datasets/Caravan.md
+++ b/docs/jupyterbook/datasets/Caravan.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Caravan
diff --git a/docs/jupyterbook/datasets/Carseats.ipynb b/docs/jupyterbook/datasets/Carseats.ipynb
index 911e767..dfd36d4 100644
--- a/docs/jupyterbook/datasets/Carseats.ipynb
+++ b/docs/jupyterbook/datasets/Carseats.ipynb
@@ -83,9 +83,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Carseats.md b/docs/jupyterbook/datasets/Carseats.md
index 3c74d37..76f56e4 100644
--- a/docs/jupyterbook/datasets/Carseats.md
+++ b/docs/jupyterbook/datasets/Carseats.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Sales of Child Car Seats
diff --git a/docs/jupyterbook/datasets/College.ipynb b/docs/jupyterbook/datasets/College.ipynb
index ef2f53d..af1027d 100644
--- a/docs/jupyterbook/datasets/College.ipynb
+++ b/docs/jupyterbook/datasets/College.ipynb
@@ -104,9 +104,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/College.md b/docs/jupyterbook/datasets/College.md
index 5e2e422..95b0bb3 100644
--- a/docs/jupyterbook/datasets/College.md
+++ b/docs/jupyterbook/datasets/College.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# U.S. News and World Report's College Data
diff --git a/docs/jupyterbook/datasets/Credit.ipynb b/docs/jupyterbook/datasets/Credit.ipynb
index c4c79b5..f5e51a9 100644
--- a/docs/jupyterbook/datasets/Credit.ipynb
+++ b/docs/jupyterbook/datasets/Credit.ipynb
@@ -89,9 +89,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Credit.md b/docs/jupyterbook/datasets/Credit.md
index 36d2502..51de59d 100644
--- a/docs/jupyterbook/datasets/Credit.md
+++ b/docs/jupyterbook/datasets/Credit.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Credit Card Balance Data
diff --git a/docs/jupyterbook/datasets/Default.ipynb b/docs/jupyterbook/datasets/Default.ipynb
index 4799474..64357ef 100644
--- a/docs/jupyterbook/datasets/Default.ipynb
+++ b/docs/jupyterbook/datasets/Default.ipynb
@@ -83,9 +83,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Default.md b/docs/jupyterbook/datasets/Default.md
index f1c9acc..5aeaed2 100644
--- a/docs/jupyterbook/datasets/Default.md
+++ b/docs/jupyterbook/datasets/Default.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Credit Card Default Data
diff --git a/docs/jupyterbook/datasets/Fund.ipynb b/docs/jupyterbook/datasets/Fund.ipynb
index 905528d..fce1859 100644
--- a/docs/jupyterbook/datasets/Fund.ipynb
+++ b/docs/jupyterbook/datasets/Fund.ipynb
@@ -51,9 +51,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Fund.md b/docs/jupyterbook/datasets/Fund.md
index 4e53d4f..89009c2 100644
--- a/docs/jupyterbook/datasets/Fund.md
+++ b/docs/jupyterbook/datasets/Fund.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Fund Manager Data
diff --git a/docs/jupyterbook/datasets/Hitters.ipynb b/docs/jupyterbook/datasets/Hitters.ipynb
index 295f50b..6f261cd 100644
--- a/docs/jupyterbook/datasets/Hitters.ipynb
+++ b/docs/jupyterbook/datasets/Hitters.ipynb
@@ -110,9 +110,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Hitters.md b/docs/jupyterbook/datasets/Hitters.md
index 7f8d6b7..2fdecf0 100644
--- a/docs/jupyterbook/datasets/Hitters.md
+++ b/docs/jupyterbook/datasets/Hitters.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Baseball Data
diff --git a/docs/jupyterbook/datasets/Khan.ipynb b/docs/jupyterbook/datasets/Khan.ipynb
index a1f89a4..f12a5ca 100644
--- a/docs/jupyterbook/datasets/Khan.ipynb
+++ b/docs/jupyterbook/datasets/Khan.ipynb
@@ -81,9 +81,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Khan.md b/docs/jupyterbook/datasets/Khan.md
index f943e99..6f0c303 100644
--- a/docs/jupyterbook/datasets/Khan.md
+++ b/docs/jupyterbook/datasets/Khan.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Khan Gene Data
diff --git a/docs/jupyterbook/datasets/NCI60.ipynb b/docs/jupyterbook/datasets/NCI60.ipynb
index d8e2aec..bbb576f 100644
--- a/docs/jupyterbook/datasets/NCI60.ipynb
+++ b/docs/jupyterbook/datasets/NCI60.ipynb
@@ -62,9 +62,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/NCI60.md b/docs/jupyterbook/datasets/NCI60.md
index 4cc96c6..621445e 100644
--- a/docs/jupyterbook/datasets/NCI60.md
+++ b/docs/jupyterbook/datasets/NCI60.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# NCI 60 Data
diff --git a/docs/jupyterbook/datasets/NYSE.ipynb b/docs/jupyterbook/datasets/NYSE.ipynb
index d884201..5f9dbd5 100644
--- a/docs/jupyterbook/datasets/NYSE.ipynb
+++ b/docs/jupyterbook/datasets/NYSE.ipynb
@@ -79,9 +79,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/NYSE.md b/docs/jupyterbook/datasets/NYSE.md
index a84a9d4..bdb9581 100644
--- a/docs/jupyterbook/datasets/NYSE.md
+++ b/docs/jupyterbook/datasets/NYSE.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# New York Stock Exchange Data
diff --git a/docs/jupyterbook/datasets/OJ.ipynb b/docs/jupyterbook/datasets/OJ.ipynb
index 30046cb..e18a4de 100644
--- a/docs/jupyterbook/datasets/OJ.ipynb
+++ b/docs/jupyterbook/datasets/OJ.ipynb
@@ -107,9 +107,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/OJ.md b/docs/jupyterbook/datasets/OJ.md
index 8681ea9..94fd7c6 100644
--- a/docs/jupyterbook/datasets/OJ.md
+++ b/docs/jupyterbook/datasets/OJ.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Orange Juice Data
diff --git a/docs/jupyterbook/datasets/Portfolio.ipynb b/docs/jupyterbook/datasets/Portfolio.ipynb
index 0596162..6d6a60d 100644
--- a/docs/jupyterbook/datasets/Portfolio.ipynb
+++ b/docs/jupyterbook/datasets/Portfolio.ipynb
@@ -68,9 +68,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Portfolio.md b/docs/jupyterbook/datasets/Portfolio.md
index e130b81..3a79d35 100644
--- a/docs/jupyterbook/datasets/Portfolio.md
+++ b/docs/jupyterbook/datasets/Portfolio.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Portfolio Data
diff --git a/docs/jupyterbook/datasets/Publication.ipynb b/docs/jupyterbook/datasets/Publication.ipynb
index c97b201..a4a6dfa 100644
--- a/docs/jupyterbook/datasets/Publication.ipynb
+++ b/docs/jupyterbook/datasets/Publication.ipynb
@@ -91,9 +91,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Publication.md b/docs/jupyterbook/datasets/Publication.md
index 94c18bd..78261af 100644
--- a/docs/jupyterbook/datasets/Publication.md
+++ b/docs/jupyterbook/datasets/Publication.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Time-to-Publication Data
diff --git a/docs/jupyterbook/datasets/Smarket.ipynb b/docs/jupyterbook/datasets/Smarket.ipynb
index 35a1918..cced2a9 100644
--- a/docs/jupyterbook/datasets/Smarket.ipynb
+++ b/docs/jupyterbook/datasets/Smarket.ipynb
@@ -87,9 +87,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Smarket.md b/docs/jupyterbook/datasets/Smarket.md
index a42c94e..2c0e120 100644
--- a/docs/jupyterbook/datasets/Smarket.md
+++ b/docs/jupyterbook/datasets/Smarket.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# S&P Stock Market Data
diff --git a/docs/jupyterbook/datasets/USArrests.ipynb b/docs/jupyterbook/datasets/USArrests.ipynb
index 4a6a1c0..1107424 100644
--- a/docs/jupyterbook/datasets/USArrests.ipynb
+++ b/docs/jupyterbook/datasets/USArrests.ipynb
@@ -202,9 +202,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
diff --git a/docs/jupyterbook/datasets/USArrests.md b/docs/jupyterbook/datasets/USArrests.md
index 7cbede1..ee3c962 100644
--- a/docs/jupyterbook/datasets/USArrests.md
+++ b/docs/jupyterbook/datasets/USArrests.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Violent Crime Rates by US State
diff --git a/docs/jupyterbook/datasets/Wage.ipynb b/docs/jupyterbook/datasets/Wage.ipynb
index ad8f9b0..b95d853 100644
--- a/docs/jupyterbook/datasets/Wage.ipynb
+++ b/docs/jupyterbook/datasets/Wage.ipynb
@@ -99,9 +99,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Wage.md b/docs/jupyterbook/datasets/Wage.md
index eeeb3c4..fd22e30 100644
--- a/docs/jupyterbook/datasets/Wage.md
+++ b/docs/jupyterbook/datasets/Wage.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Mid-Atlantic Wage Data
diff --git a/docs/jupyterbook/datasets/Weekly.ipynb b/docs/jupyterbook/datasets/Weekly.ipynb
index cf08b80..69f26d6 100644
--- a/docs/jupyterbook/datasets/Weekly.ipynb
+++ b/docs/jupyterbook/datasets/Weekly.ipynb
@@ -95,9 +95,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/datasets/Weekly.md b/docs/jupyterbook/datasets/Weekly.md
index c0639ea..c239c5e 100644
--- a/docs/jupyterbook/datasets/Weekly.md
+++ b/docs/jupyterbook/datasets/Weekly.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Weekly S&P Stock Market Data
diff --git a/docs/jupyterbook/helpers/cluster.ipynb b/docs/jupyterbook/helpers/cluster.ipynb
index bf237a3..31798a0 100644
--- a/docs/jupyterbook/helpers/cluster.ipynb
+++ b/docs/jupyterbook/helpers/cluster.ipynb
@@ -8,15 +8,27 @@
"# Clustering\n",
"\n",
"This module has a single function, used to help visualize a dendrogram from a\n",
- "hierarchical clustering."
+ "hierarchical clustering. The function is based on this example from [sklearn.cluster](https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html)."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "d5df152d",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'sklearn'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcluster\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AgglomerativeClustering\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcluster\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhierarchy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dendrogram\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mISLP\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcluster\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compute_linkage\n",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'"
+ ]
+ }
+ ],
"source": [
"import numpy as np\n",
"from sklearn.cluster import AgglomerativeClustering\n",
@@ -34,7 +46,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "0135c1fb",
"metadata": {},
"outputs": [],
@@ -101,9 +113,21 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.10"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/helpers/cluster.md b/docs/jupyterbook/helpers/cluster.md
index ab31348..b951b18 100644
--- a/docs/jupyterbook/helpers/cluster.md
+++ b/docs/jupyterbook/helpers/cluster.md
@@ -7,19 +7,19 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: Python 3 (ipykernel)
language: python
- name: islp_test
+ name: python3
---
# Clustering
This module has a single function, used to help visualize a dendrogram from a
-hierarchical clustering.
+hierarchical clustering. The function is based on this example from [sklearn.cluster](https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html).
-```{code-cell}
+```{code-cell} ipython3
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
@@ -28,7 +28,7 @@ from ISLP.cluster import compute_linkage
## Make a toy dataset
-```{code-cell}
+```{code-cell} ipython3
rng = np.random.default_rng(1)
X = rng.normal(size=(30, 5))
X[:10] += 1
@@ -36,19 +36,19 @@ X[:10] += 1
## Cluster it
-```{code-cell}
+```{code-cell} ipython3
clust = AgglomerativeClustering(distance_threshold=0,
n_clusters=None,
linkage='complete')
```
-```{code-cell}
+```{code-cell} ipython3
clust.fit(X)
```
## Plot the dendrogram
-```{code-cell}
+```{code-cell} ipython3
linkage = compute_linkage(clust)
dendrogram(linkage);
```
diff --git a/docs/jupyterbook/helpers/pygam.ipynb b/docs/jupyterbook/helpers/pygam.ipynb
index 01a1e55..aab61d1 100644
--- a/docs/jupyterbook/helpers/pygam.ipynb
+++ b/docs/jupyterbook/helpers/pygam.ipynb
@@ -207,9 +207,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/helpers/pygam.md b/docs/jupyterbook/helpers/pygam.md
index c91084c..56adc84 100644
--- a/docs/jupyterbook/helpers/pygam.md
+++ b/docs/jupyterbook/helpers/pygam.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Generalized Additive Models
diff --git a/docs/jupyterbook/helpers/survival.ipynb b/docs/jupyterbook/helpers/survival.ipynb
index e6b9e3a..7cb30a3 100644
--- a/docs/jupyterbook/helpers/survival.ipynb
+++ b/docs/jupyterbook/helpers/survival.ipynb
@@ -108,9 +108,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/helpers/survival.md b/docs/jupyterbook/helpers/survival.md
index 715f8bd..58b129d 100644
--- a/docs/jupyterbook/helpers/survival.md
+++ b/docs/jupyterbook/helpers/survival.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Survival Analysis
diff --git a/docs/jupyterbook/helpers/svm.ipynb b/docs/jupyterbook/helpers/svm.ipynb
index dac6c39..593d840 100644
--- a/docs/jupyterbook/helpers/svm.ipynb
+++ b/docs/jupyterbook/helpers/svm.ipynb
@@ -103,9 +103,9 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/helpers/svm.md b/docs/jupyterbook/helpers/svm.md
index 007eb7a..3025490 100644
--- a/docs/jupyterbook/helpers/svm.md
+++ b/docs/jupyterbook/helpers/svm.md
@@ -7,11 +7,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Support Vector Machines
diff --git a/docs/jupyterbook/imdb.ipynb b/docs/jupyterbook/imdb.ipynb
index d490921..ae0d7dd 100644
--- a/docs/jupyterbook/imdb.ipynb
+++ b/docs/jupyterbook/imdb.ipynb
@@ -5,71 +5,109 @@
"id": "50f2b809",
"metadata": {},
"source": [
- "# Creating a clean IMDB dataset\n",
+ "# Creating IMDB dataset from `keras` version\n",
+ "\n",
+ "This script details how the `IMDB` data in `ISLP` was constructed.\n",
"\n",
"Running this example requires `keras`. Use `pip install keras` to install if necessary."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "d920bb2e",
"metadata": {},
"outputs": [],
"source": [
- "import pickle"
+ "import pickle\n",
+ "import numpy as np\n",
+ "from scipy.sparse import coo_matrix, save_npz\n",
+ "import torch\n",
+ "from keras.datasets import imdb\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "e507f1fb",
+ "cell_type": "markdown",
+ "id": "eaf27f0c-0cb0-4ad5-8775-d138e3f20933",
"metadata": {},
- "outputs": [],
"source": [
- "import numpy as np\n",
- "from scipy.sparse import coo_matrix, save_npz\n",
- "import torch"
+ "We first load the data using `keras`, limiting focus to the 10000 most commmon words."
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b94d3f35",
+ "execution_count": 2,
+ "id": "29f0e01e",
"metadata": {},
"outputs": [],
"source": [
- "from keras.datasets import imdb\n",
- "from tensorflow.keras.preprocessing.sequence import pad_sequences"
+ "# the 3 is for three terms: \n",
+ "num_words = 10000+3\n",
+ "((S_train, L_train), \n",
+ " (S_test, L_test)) = imdb.load_data(num_words=num_words)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9020ab27-cc62-4b86-85ba-80a94ff692de",
+ "metadata": {},
+ "source": [
+ "The object `S_train` is effectively a list in which each document has been encoded into a sequence of\n",
+ "values from 0 to 10002."
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "29f0e01e",
+ "execution_count": 3,
+ "id": "e27564c4-320f-42b6-9f2e-2a2afdebefcf",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# the 3 is for three terms: \n",
- "num_words = 10000+3\n",
- "((S_train, Y_train), \n",
- " (S_test, Y_test)) = imdb.load_data(num_words=num_words)"
+ "S_train[0][:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15f039fe-faed-4884-a725-1c51d6c8d4d4",
+ "metadata": {},
+ "source": [
+ "We'll use `np.float32` as that is the common precision used in `torch`."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"id": "6cc3c3cb",
"metadata": {},
"outputs": [],
"source": [
- "Y_train = Y_train.astype(np.float32)\n",
- "Y_test = Y_test.astype(np.float32)"
+ "L_train = L_train.astype(np.float32)\n",
+ "L_test = L_test.astype(np.float32)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "005679bc-4337-4757-831e-f9a6ea50f6aa",
+ "metadata": {},
+ "source": [
+ "We will use a one-hot encoding that captures whether or not a given word appears in a given review."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"id": "7b6d1098",
"metadata": {},
"outputs": [],
@@ -88,18 +126,30 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "afcdc8b2",
"metadata": {},
"outputs": [],
"source": [
- "X_train, L_train = one_hot(S_train, num_words), Y_train\n",
+ "X_train = one_hot(S_train, num_words)\n",
"X_test = one_hot(S_test, num_words)"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "a67e299d-8774-4758-8953-77afdce775ab",
+ "metadata": {},
+ "source": [
+ "## Store as sparse tensors\n",
+ "\n",
+ "We see later in the lab that the dense representation is faster. Nevertheless,\n",
+ "let's store the one-hot representation as sparse `torch` tensors \n",
+ "as well as sparse `scipy` matrices."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "b19366ea",
"metadata": {},
"outputs": [],
@@ -115,7 +165,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "b45ae6d1",
"metadata": {},
"outputs": [],
@@ -126,7 +176,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "a47d6eb6",
"metadata": {},
"outputs": [],
@@ -137,7 +187,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "d1b37b37",
"metadata": {},
"outputs": [],
@@ -151,12 +201,12 @@
"id": "1119823a",
"metadata": {},
"source": [
- "save the sparse matrices"
+ "### Save as sparse `scipy` matrices"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "6cb6bfdf",
"metadata": {},
"outputs": [],
@@ -167,12 +217,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"id": "eac1c2ae",
"metadata": {},
"outputs": [],
"source": [
- "np.save('IMDB_Y_test.npy', Y_test)\n",
+ "np.save('IMDB_Y_test.npy', L_test)\n",
"np.save('IMDB_Y_train.npy', L_train)"
]
},
@@ -181,12 +231,14 @@
"id": "25c128e3",
"metadata": {},
"source": [
- "save and pickle the word index"
+ "## Save and pickle the word index\n",
+ "\n",
+ "We'll also want to store a lookup table to convert representations such as `S_train[0]` into words"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "8458bf67",
"metadata": {},
"outputs": [],
@@ -199,9 +251,46 @@
"lookup[4] = \"\""
]
},
+ {
+ "cell_type": "markdown",
+ "id": "5e62ebff-2575-4d35-b46c-51c6f7598efc",
+ "metadata": {},
+ "source": [
+ "Let's look at our first training document:"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
+ "id": "2aaefdf8-0a49-4bdb-8b40-55665283c8a8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\" this film was just brilliant casting location scenery story direction everyone's really suited part they played and you\""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "' '.join([lookup[i] for i in S_train[0][:20]])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0e985a73-bfd9-42bd-a523-3dc6e223d602",
+ "metadata": {},
+ "source": [
+ "We save this lookup table so it can be loaded later "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
"id": "d95252de",
"metadata": {},
"outputs": [],
@@ -214,12 +303,15 @@
"id": "b3d900b9",
"metadata": {},
"source": [
- "create the padded representations"
+ "## Padded representations\n",
+ "\n",
+ "For some of the recurrent models, we'll need sequences of common lengths, padded if necessary.\n",
+ "Here, we pad up to a maximum length of 500, filling the remaining entries with 0."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"id": "637b3c5e",
"metadata": {},
"outputs": [],
@@ -230,9 +322,17 @@
" S_test]]"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "a6218300-b355-44cc-b7fb-4bff81211aa6",
+ "metadata": {},
+ "source": [
+ "Finally, we save these for later use in the deep learning lab."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"id": "bac69f88",
"metadata": {},
"outputs": [],
@@ -249,9 +349,21 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/imdb.md b/docs/jupyterbook/imdb.md
index 313952f..0b87bae 100644
--- a/docs/jupyterbook/imdb.md
+++ b/docs/jupyterbook/imdb.md
@@ -7,45 +7,54 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
-# Creating a clean IMDB dataset
+# Creating IMDB dataset from `keras` version
+
+This script details how the `IMDB` data in `ISLP` was constructed.
Running this example requires `keras`. Use `pip install keras` to install if necessary.
-```{code-cell}
+```{code-cell} ipython3
import pickle
-```
-
-```{code-cell}
import numpy as np
from scipy.sparse import coo_matrix, save_npz
import torch
-```
-
-```{code-cell}
from keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
```
-```{code-cell}
+We first load the data using `keras`, limiting focus to the 10000 most commmon words.
+
+```{code-cell} ipython3
# the 3 is for three terms:
num_words = 10000+3
-((S_train, Y_train),
- (S_test, Y_test)) = imdb.load_data(num_words=num_words)
+((S_train, L_train),
+ (S_test, L_test)) = imdb.load_data(num_words=num_words)
```
-```{code-cell}
-Y_train = Y_train.astype(np.float32)
-Y_test = Y_test.astype(np.float32)
+The object `S_train` is effectively a list in which each document has been encoded into a sequence of
+values from 0 to 10002.
+
+```{code-cell} ipython3
+S_train[0][:10]
+```
+
+We'll use `np.float32` as that is the common precision used in `torch`.
+
+```{code-cell} ipython3
+L_train = L_train.astype(np.float32)
+L_test = L_test.astype(np.float32)
```
-```{code-cell}
+We will use a one-hot encoding that captures whether or not a given word appears in a given review.
+
+```{code-cell} ipython3
def one_hot(sequences, ncol):
idx, vals = [], []
for i, s in enumerate(sequences):
@@ -58,12 +67,18 @@ def one_hot(sequences, ncol):
return tens.coalesce()
```
-```{code-cell}
-X_train, L_train = one_hot(S_train, num_words), Y_train
+```{code-cell} ipython3
+X_train = one_hot(S_train, num_words)
X_test = one_hot(S_test, num_words)
```
-```{code-cell}
+## Store as sparse tensors
+
+We see later in the lab that the dense representation is faster. Nevertheless,
+let's store the one-hot representation as sparse `torch` tensors
+as well as sparse `scipy` matrices.
+
+```{code-cell} ipython3
def convert_sparse_tensor(X):
idx = np.asarray(X.indices())
vals = np.asarray(X.values())
@@ -73,36 +88,38 @@ def convert_sparse_tensor(X):
shape=X.shape).tocsr()
```
-```{code-cell}
+```{code-cell} ipython3
X_train_s = convert_sparse_tensor(X_train)
X_test_s = convert_sparse_tensor(X_test)
```
-```{code-cell}
+```{code-cell} ipython3
X_train_d = torch.tensor(X_train_s.todense())
X_test_d = torch.tensor(X_test_s.todense())
```
-```{code-cell}
+```{code-cell} ipython3
torch.save(X_train_d, 'IMDB_X_train.tensor')
torch.save(X_test_d, 'IMDB_X_test.tensor')
```
-save the sparse matrices
+### Save as sparse `scipy` matrices
-```{code-cell}
+```{code-cell} ipython3
save_npz('IMDB_X_test.npz', X_test_s)
save_npz('IMDB_X_train.npz', X_train_s)
```
-```{code-cell}
-np.save('IMDB_Y_test.npy', Y_test)
+```{code-cell} ipython3
+np.save('IMDB_Y_test.npy', L_test)
np.save('IMDB_Y_train.npy', L_train)
```
-save and pickle the word index
+## Save and pickle the word index
-```{code-cell}
+We'll also want to store a lookup table to convert representations such as `S_train[0]` into words
+
+```{code-cell} ipython3
word_index = imdb.get_word_index()
lookup = {(i+3):w for w, i in word_index.items()}
lookup[0] = ""
@@ -111,20 +128,33 @@ lookup[2] = ""
lookup[4] = ""
```
-```{code-cell}
+Let's look at our first training document:
+
+```{code-cell} ipython3
+' '.join([lookup[i] for i in S_train[0][:20]])
+```
+
+We save this lookup table so it can be loaded later
+
+```{code-cell} ipython3
pickle.dump(lookup, open('IMDB_word_index.pkl', 'bw'))
```
-create the padded representations
+## Padded representations
-```{code-cell}
+For some of the recurrent models, we'll need sequences of common lengths, padded if necessary.
+Here, we pad up to a maximum length of 500, filling the remaining entries with 0.
+
+```{code-cell} ipython3
(S_train,
S_test) = [torch.tensor(pad_sequences(S, maxlen=500, value=0))
for S in [S_train,
S_test]]
```
-```{code-cell}
+Finally, we save these for later use in the deep learning lab.
+
+```{code-cell} ipython3
torch.save(S_train, 'IMDB_S_train.tensor')
torch.save(S_test, 'IMDB_S_test.tensor')
```
diff --git a/docs/jupyterbook/models/anova.ipynb b/docs/jupyterbook/models/anova.ipynb
new file mode 100644
index 0000000..41e8bcb
--- /dev/null
+++ b/docs/jupyterbook/models/anova.ipynb
@@ -0,0 +1,648 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ee33d364",
+ "metadata": {},
+ "source": [
+ "# ANOVA using `ModelSpec`\n",
+ "\n",
+ "\n",
+ "In this lab we illustrate how to run create specific ANOVA analyses\n",
+ "using `ModelSpec`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "4c70fbaa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from statsmodels.api import OLS\n",
+ "from statsmodels.stats.anova import anova_lm\n",
+ "\n",
+ "from ISLP import load_data\n",
+ "from ISLP.models import (ModelSpec,\n",
+ " derived_feature,\n",
+ " summarize)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "333a49cf",
+ "metadata": {},
+ "source": [
+ "We will carry out two simple ANOVA analyses of the `Hitters` data.\n",
+ "We wish to predict a baseball player’s `Salary` on the\n",
+ "basis of various statistics associated with performance in the\n",
+ "previous year."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8a708215",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "59"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Hitters = load_data('Hitters')\n",
+ "np.isnan(Hitters['Salary']).sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dad5e991",
+ "metadata": {},
+ "source": [
+ " \n",
+ " We see that `Salary` is missing for 59 players. The\n",
+ "`dropna()` method of data frames removes all of the rows that have missing\n",
+ "values in any variable (by default --- see `Hitters.dropna?`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "ac7086a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n",
+ " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n",
+ " 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Hitters = Hitters.dropna()\n",
+ "Hitters.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a0a3521-be74-40df-a404-3895d80a11dc",
+ "metadata": {},
+ "source": [
+ "## Grouping variables\n",
+ "\n",
+ "A look at the [description](https://islp.readthedocs.io/en/latest/datasets/Hitters.html) of the data shows\n",
+ "that there are both career and 1986 offensive stats, as well as some defensive stats.\n",
+ "\n",
+ "Let's group the offensive into recent and career offensive stats, as well as a group of defensive variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "a215e43b-7bc8-4bdd-91cf-40d717cd7978",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "confounders = derived_feature(['Division', 'League', 'NewLeague'],\n",
+ " name='confounders')\n",
+ "offense_career = derived_feature(['CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks'],\n",
+ " name='offense_career')\n",
+ "offense_1986 = derived_feature(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks'],\n",
+ " name='offense_1986')\n",
+ "defense_1986 = derived_feature(['PutOuts', 'Assists', 'Errors'],\n",
+ " name='defense_1986')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aa15fd0c-1e8a-431e-8425-c61da8439976",
+ "metadata": {},
+ "source": [
+ "We'll first do a sequential ANOVA where terms are added sequentially"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "40cd6c28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "design = ModelSpec([confounders, offense_career, defense_1986, offense_1986]).fit(Hitters)\n",
+ "Y = np.array(Hitters['Salary'])\n",
+ "X = design.transform(Hitters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "074120b1",
+ "metadata": {},
+ "source": [
+ "Along with a score we need to specify the search strategy. This is done through the object\n",
+ "`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()`\n",
+ "runs forward stepwise until any further additions to the model do not result\n",
+ "in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`\n",
+ "runs a fixed number of steps of stepwise search."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e65f5607",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " coef | \n",
+ " std err | \n",
+ " t | \n",
+ " P>|t| | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | intercept | \n",
+ " 148.2187 | \n",
+ " 73.595 | \n",
+ " 2.014 | \n",
+ " 0.045 | \n",
+ "
\n",
+ " \n",
+ " | Division[W] | \n",
+ " -116.0404 | \n",
+ " 40.188 | \n",
+ " -2.887 | \n",
+ " 0.004 | \n",
+ "
\n",
+ " \n",
+ " | League[N] | \n",
+ " 63.7503 | \n",
+ " 79.006 | \n",
+ " 0.807 | \n",
+ " 0.421 | \n",
+ "
\n",
+ " \n",
+ " | NewLeague[N] | \n",
+ " -24.3989 | \n",
+ " 78.843 | \n",
+ " -0.309 | \n",
+ " 0.757 | \n",
+ "
\n",
+ " \n",
+ " | CAtBat | \n",
+ " -0.1887 | \n",
+ " 0.120 | \n",
+ " -1.572 | \n",
+ " 0.117 | \n",
+ "
\n",
+ " \n",
+ " | CHits | \n",
+ " 0.1636 | \n",
+ " 0.665 | \n",
+ " 0.246 | \n",
+ " 0.806 | \n",
+ "
\n",
+ " \n",
+ " | CHmRun | \n",
+ " -0.1517 | \n",
+ " 1.612 | \n",
+ " -0.094 | \n",
+ " 0.925 | \n",
+ "
\n",
+ " \n",
+ " | CRuns | \n",
+ " 1.4716 | \n",
+ " 0.747 | \n",
+ " 1.971 | \n",
+ " 0.050 | \n",
+ "
\n",
+ " \n",
+ " | CRBI | \n",
+ " 0.8021 | \n",
+ " 0.691 | \n",
+ " 1.161 | \n",
+ " 0.247 | \n",
+ "
\n",
+ " \n",
+ " | CWalks | \n",
+ " -0.8124 | \n",
+ " 0.327 | \n",
+ " -2.481 | \n",
+ " 0.014 | \n",
+ "
\n",
+ " \n",
+ " | PutOuts | \n",
+ " 0.2827 | \n",
+ " 0.077 | \n",
+ " 3.661 | \n",
+ " 0.000 | \n",
+ "
\n",
+ " \n",
+ " | Assists | \n",
+ " 0.3755 | \n",
+ " 0.220 | \n",
+ " 1.705 | \n",
+ " 0.089 | \n",
+ "
\n",
+ " \n",
+ " | Errors | \n",
+ " -3.2940 | \n",
+ " 4.377 | \n",
+ " -0.753 | \n",
+ " 0.452 | \n",
+ "
\n",
+ " \n",
+ " | AtBat | \n",
+ " -1.9509 | \n",
+ " 0.624 | \n",
+ " -3.125 | \n",
+ " 0.002 | \n",
+ "
\n",
+ " \n",
+ " | Hits | \n",
+ " 7.4395 | \n",
+ " 2.363 | \n",
+ " 3.148 | \n",
+ " 0.002 | \n",
+ "
\n",
+ " \n",
+ " | HmRun | \n",
+ " 4.3449 | \n",
+ " 6.190 | \n",
+ " 0.702 | \n",
+ " 0.483 | \n",
+ "
\n",
+ " \n",
+ " | Runs | \n",
+ " -2.3312 | \n",
+ " 2.971 | \n",
+ " -0.785 | \n",
+ " 0.433 | \n",
+ "
\n",
+ " \n",
+ " | RBI | \n",
+ " -1.0670 | \n",
+ " 2.595 | \n",
+ " -0.411 | \n",
+ " 0.681 | \n",
+ "
\n",
+ " \n",
+ " | Walks | \n",
+ " 6.2196 | \n",
+ " 1.825 | \n",
+ " 3.409 | \n",
+ " 0.001 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " coef std err t P>|t|\n",
+ "intercept 148.2187 73.595 2.014 0.045\n",
+ "Division[W] -116.0404 40.188 -2.887 0.004\n",
+ "League[N] 63.7503 79.006 0.807 0.421\n",
+ "NewLeague[N] -24.3989 78.843 -0.309 0.757\n",
+ "CAtBat -0.1887 0.120 -1.572 0.117\n",
+ "CHits 0.1636 0.665 0.246 0.806\n",
+ "CHmRun -0.1517 1.612 -0.094 0.925\n",
+ "CRuns 1.4716 0.747 1.971 0.050\n",
+ "CRBI 0.8021 0.691 1.161 0.247\n",
+ "CWalks -0.8124 0.327 -2.481 0.014\n",
+ "PutOuts 0.2827 0.077 3.661 0.000\n",
+ "Assists 0.3755 0.220 1.705 0.089\n",
+ "Errors -3.2940 4.377 -0.753 0.452\n",
+ "AtBat -1.9509 0.624 -3.125 0.002\n",
+ "Hits 7.4395 2.363 3.148 0.002\n",
+ "HmRun 4.3449 6.190 0.702 0.483\n",
+ "Runs -2.3312 2.971 -0.785 0.433\n",
+ "RBI -1.0670 2.595 -0.411 0.681\n",
+ "Walks 6.2196 1.825 3.409 0.001"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "M = OLS(Y, X).fit()\n",
+ "summarize(M)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29d9b55f",
+ "metadata": {},
+ "source": [
+ "We'll first produce the sequential, or Type I ANOVA results. This builds up a model sequentially and compares\n",
+ "two successive models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "cfbe5b92",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " df_resid | \n",
+ " ssr | \n",
+ " df_diff | \n",
+ " ss_diff | \n",
+ " F | \n",
+ " Pr(>F) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | intercept | \n",
+ " 262.0 | \n",
+ " 5.331911e+07 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | confounders | \n",
+ " 259.0 | \n",
+ " 5.131263e+07 | \n",
+ " 3.0 | \n",
+ " 2.006478e+06 | \n",
+ " 6.741147 | \n",
+ " 2.144265e-04 | \n",
+ "
\n",
+ " \n",
+ " | offense_career | \n",
+ " 253.0 | \n",
+ " 3.059130e+07 | \n",
+ " 6.0 | \n",
+ " 2.072134e+07 | \n",
+ " 34.808656 | \n",
+ " 1.470455e-30 | \n",
+ "
\n",
+ " \n",
+ " | defense_1986 | \n",
+ " 250.0 | \n",
+ " 2.730614e+07 | \n",
+ " 3.0 | \n",
+ " 3.285156e+06 | \n",
+ " 11.037111 | \n",
+ " 7.880207e-07 | \n",
+ "
\n",
+ " \n",
+ " | offense_1986 | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 6.0 | \n",
+ " 3.097572e+06 | \n",
+ " 5.203444 | \n",
+ " 4.648586e-05 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " df_resid ssr df_diff ss_diff F \\\n",
+ "intercept 262.0 5.331911e+07 0.0 NaN NaN \n",
+ "confounders 259.0 5.131263e+07 3.0 2.006478e+06 6.741147 \n",
+ "offense_career 253.0 3.059130e+07 6.0 2.072134e+07 34.808656 \n",
+ "defense_1986 250.0 2.730614e+07 3.0 3.285156e+06 11.037111 \n",
+ "offense_1986 244.0 2.420857e+07 6.0 3.097572e+06 5.203444 \n",
+ "\n",
+ " Pr(>F) \n",
+ "intercept NaN \n",
+ "confounders 2.144265e-04 \n",
+ "offense_career 1.470455e-30 \n",
+ "defense_1986 7.880207e-07 \n",
+ "offense_1986 4.648586e-05 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = anova_lm(*[OLS(Y, D).fit() for D in design.build_sequence(Hitters, anova_type='sequential')])\n",
+ "df.index = design.names\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7092f666",
+ "metadata": {},
+ "source": [
+ "We can similarly compute the Type II ANOVA results which drops each term and compares to the full model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "e2d43844",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " df_resid | \n",
+ " ssr | \n",
+ " df_diff | \n",
+ " ss_diff | \n",
+ " F | \n",
+ " Pr(>F) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | intercept | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 1.0 | \n",
+ " 4.024254e+05 | \n",
+ " 4.056076 | \n",
+ " 4.511037e-02 | \n",
+ "
\n",
+ " \n",
+ " | confounders | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 3.0 | \n",
+ " 9.661738e+05 | \n",
+ " 3.246046 | \n",
+ " 2.261572e-02 | \n",
+ "
\n",
+ " \n",
+ " | offense_career | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 6.0 | \n",
+ " 1.051025e+07 | \n",
+ " 17.655596 | \n",
+ " 5.701196e-17 | \n",
+ "
\n",
+ " \n",
+ " | defense_1986 | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 3.0 | \n",
+ " 1.467933e+06 | \n",
+ " 4.931803 | \n",
+ " 2.415732e-03 | \n",
+ "
\n",
+ " \n",
+ " | offense_1986 | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 6.0 | \n",
+ " 3.097572e+06 | \n",
+ " 5.203444 | \n",
+ " 4.648586e-05 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " df_resid ssr df_diff ss_diff F \\\n",
+ "intercept 244.0 2.420857e+07 1.0 4.024254e+05 4.056076 \n",
+ "confounders 244.0 2.420857e+07 3.0 9.661738e+05 3.246046 \n",
+ "offense_career 244.0 2.420857e+07 6.0 1.051025e+07 17.655596 \n",
+ "defense_1986 244.0 2.420857e+07 3.0 1.467933e+06 4.931803 \n",
+ "offense_1986 244.0 2.420857e+07 6.0 3.097572e+06 5.203444 \n",
+ "\n",
+ " Pr(>F) \n",
+ "intercept 4.511037e-02 \n",
+ "confounders 2.261572e-02 \n",
+ "offense_career 5.701196e-17 \n",
+ "defense_1986 2.415732e-03 \n",
+ "offense_1986 4.648586e-05 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "D_full = design.transform(Hitters)\n",
+ "OLS_full = OLS(Y, D_full).fit()\n",
+ "dfs = []\n",
+ "for d in design.build_sequence(Hitters, anova_type='drop'):\n",
+ " dfs.append(anova_lm(OLS(Y,d).fit(), OLS_full).iloc[1:])\n",
+ "df = pd.concat(dfs)\n",
+ "df.index = design.names\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "362709ae-9558-4c4c-8f5e-f8388caf631d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
+ },
+ "kernelspec": {
+ "display_name": "python3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/jupyterbook/models/anova.md b/docs/jupyterbook/models/anova.md
new file mode 100644
index 0000000..574f9eb
--- /dev/null
+++ b/docs/jupyterbook/models/anova.md
@@ -0,0 +1,115 @@
+---
+jupytext:
+ formats: source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb
+ text_representation:
+ extension: .md
+ format_name: myst
+ format_version: 0.13
+ jupytext_version: 1.14.5
+kernelspec:
+ display_name: python3
+ language: python
+ name: python3
+---
+
+# ANOVA using `ModelSpec`
+
+
+In this lab we illustrate how to create specific ANOVA analyses
+using `ModelSpec`.
+
+```{code-cell} ipython3
+import numpy as np
+import pandas as pd
+
+from statsmodels.api import OLS
+from statsmodels.stats.anova import anova_lm
+
+from ISLP import load_data
+from ISLP.models import (ModelSpec,
+ derived_feature,
+ summarize)
+```
+
+We will carry out two simple ANOVA analyses of the `Hitters` data.
+We wish to predict a baseball player’s `Salary` on the
+basis of various statistics associated with performance in the
+previous year.
+
+```{code-cell} ipython3
+Hitters = load_data('Hitters')
+np.isnan(Hitters['Salary']).sum()
+```
+
+
+ We see that `Salary` is missing for 59 players. The
+`dropna()` method of data frames removes all of the rows that have missing
+values in any variable (by default --- see `Hitters.dropna?`).
+
+```{code-cell} ipython3
+Hitters = Hitters.dropna()
+Hitters.columns
+```
+
+## Grouping variables
+
+A look at the [description](https://islp.readthedocs.io/en/latest/datasets/Hitters.html) of the data shows
+that there are both career and 1986 offensive stats, as well as some defensive stats.
+
+Let's group the offensive into recent and career offensive stats, as well as a group of defensive variables.
+
+```{code-cell} ipython3
+confounders = derived_feature(['Division', 'League', 'NewLeague'],
+ name='confounders')
+offense_career = derived_feature(['CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks'],
+ name='offense_career')
+offense_1986 = derived_feature(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks'],
+ name='offense_1986')
+defense_1986 = derived_feature(['PutOuts', 'Assists', 'Errors'],
+ name='defense_1986')
+```
+
+We'll first do a sequential ANOVA where terms are added sequentially
+
+```{code-cell} ipython3
+design = ModelSpec([confounders, offense_career, defense_1986, offense_1986]).fit(Hitters)
+Y = np.array(Hitters['Salary'])
+X = design.transform(Hitters)
+```
+
+Along with a score we need to specify the search strategy. This is done through the object
+`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()`
+runs forward stepwise until any further additions to the model do not result
+in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`
+runs a fixed number of steps of stepwise search.
+
+```{code-cell} ipython3
+M = OLS(Y, X).fit()
+summarize(M)
+```
+
+We'll first produce the sequential, or Type I ANOVA results. This builds up a model sequentially and compares
+two successive models.
+
+```{code-cell} ipython3
+df = anova_lm(*[OLS(Y, D).fit() for D in design.build_sequence(Hitters, anova_type='sequential')])
+df.index = design.names
+df
+```
+
+We can similarly compute the Type II ANOVA results which drops each term and compares to the full model.
+
+```{code-cell} ipython3
+D_full = design.transform(Hitters)
+OLS_full = OLS(Y, D_full).fit()
+dfs = []
+for d in design.build_sequence(Hitters, anova_type='drop'):
+ dfs.append(anova_lm(OLS(Y,d).fit(), OLS_full).iloc[1:])
+df = pd.concat(dfs)
+df.index = design.names
+df
+```
+
+```{code-cell} ipython3
+
+```
diff --git a/docs/jupyterbook/models/derived.ipynb b/docs/jupyterbook/models/derived.ipynb
deleted file mode 100644
index 92fc096..0000000
--- a/docs/jupyterbook/models/derived.ipynb
+++ /dev/null
@@ -1,2125 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "38217f02",
- "metadata": {},
- "source": [
- "# Building design matrices with `ModelSpec`\n",
- "\n",
- "Force rebuild"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "3107d1f9",
- "metadata": {},
- "outputs": [],
- "source": [
- "x=4\n",
- "import numpy as np, pandas as pd\n",
- "%load_ext rpy2.ipython\n",
- "\n",
- "from ISLP import load_data\n",
- "from ISLP.models import ModelSpec\n",
- "\n",
- "import statsmodels.api as sm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "cdc46a4e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n",
- " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats = load_data('Carseats')\n",
- "%R -i Carseats\n",
- "Carseats.columns"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e0a2a83a",
- "metadata": {},
- "source": [
- "## Let's break up income into groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "68b40caf",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: OIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L' < 'M' < 'H']"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'])\n",
- "Carseats['OIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "35558d88",
- "metadata": {},
- "source": [
- "Let's also create an unordered version"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "e5e81a95",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: UIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L', 'M', 'H']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'],\n",
- " ordered=False)\n",
- "Carseats['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4bbf9e13",
- "metadata": {},
- "source": [
- "## A simple model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "1ad729b3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Price', 'Income'], dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Price', 'Income'])\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "d05e9ec8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 12.661546\n",
- "Price -0.052213\n",
- "Income 0.012829\n",
- "dtype: float64"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Y = Carseats['Sales']\n",
- "M = sm.OLS(Y, X).fit()\n",
- "M.params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b4e9ee33",
- "metadata": {},
- "source": [
- "## Basic procedure\n",
- "\n",
- "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n",
- "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n",
- "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "64ac65d3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 Bad\n",
- "1 Good\n",
- "2 Medium\n",
- "3 Medium\n",
- "4 Bad\n",
- " ... \n",
- "395 Good\n",
- "396 Medium\n",
- "397 Medium\n",
- "398 Bad\n",
- "399 Good\n",
- "Name: ShelveLoc, Length: 400, dtype: category\n",
- "Categories (3, object): ['Bad', 'Good', 'Medium']"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['ShelveLoc']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "620f0e01",
- "metadata": {},
- "source": [
- "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n",
- "`get_columns` and `fit_encoder`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "77b898e0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['ShelveLoc']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4580a6bf",
- "metadata": {},
- "source": [
- "It recognized ordinal columns as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "c2dab855",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['OIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "5e7963d6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([ 73, 48, 35, 100]), ('Income',))"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "income = design.column_info_['Income']\n",
- "cols, names = income.get_columns(Carseats)\n",
- "(cols[:4], names)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6b689966",
- "metadata": {},
- "source": [
- "## Encoding a column\n",
- "\n",
- "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n",
- "variables usually are encoded by several columns, typically one less than the number of categories.\n",
- "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n",
- "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n",
- "its encoder the first time data is passed to it."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "ff3b96b6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([[0., 0.],\n",
- " [1., 0.],\n",
- " [0., 1.],\n",
- " [0., 1.]]),\n",
- " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "shelve = design.column_info_['ShelveLoc']\n",
- "cols, names = shelve.get_columns(Carseats)\n",
- "(cols[:4], names)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "7e87da20",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[2.],\n",
- " [1.],\n",
- " [1.],\n",
- " [0.]])"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "oincome = design.column_info_['OIncome']\n",
- "oincome.get_columns(Carseats)[0][:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4f2030ac",
- "metadata": {},
- "source": [
- "## The terms\n",
- "\n",
- "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n",
- "the `terms` argument which should be a sequence. The elements of `terms` are often\n",
- "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n",
- "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "27fc4fb3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Price', 'Income']"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "16316981",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ef3f2bd0",
- "metadata": {},
- "source": [
- "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n",
- "`Variable` can also create columns through the `build_columns` method of `ModelSpec`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "dd9c7fa6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price\n",
- " 0 120\n",
- " 1 83\n",
- " 2 80\n",
- " 3 97\n",
- " 4 128\n",
- " .. ...\n",
- " 395 128\n",
- " 396 120\n",
- " 397 159\n",
- " 398 95\n",
- " 399 120\n",
- " \n",
- " [400 rows x 1 columns],\n",
- " ['Price'])"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "price = design.terms_[0]\n",
- "design.build_columns(Carseats, price)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5fc4cc45",
- "metadata": {},
- "source": [
- "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
- "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
- "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n",
- "the call to `ModelSpec.fit`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "49d7fb46",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price Income UIncome[L] UIncome[M]\n",
- " 0 120.0 73.0 0.0 1.0\n",
- " 1 83.0 48.0 1.0 0.0\n",
- " 2 80.0 35.0 1.0 0.0\n",
- " 3 97.0 100.0 0.0 0.0\n",
- " 4 128.0 64.0 0.0 1.0\n",
- " .. ... ... ... ...\n",
- " 395 128.0 108.0 0.0 0.0\n",
- " 396 120.0 23.0 1.0 0.0\n",
- " 397 159.0 26.0 1.0 0.0\n",
- " 398 95.0 79.0 0.0 1.0\n",
- " 399 120.0 37.0 1.0 0.0\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import Variable\n",
- "\n",
- "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n",
- "design.build_columns(Carseats, new_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bdfc0fe9",
- "metadata": {},
- "source": [
- "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n",
- "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "cf6f3f4c",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( mynewvar[0] mynewvar[1]\n",
- " 0 -3.608693 -4.853177\n",
- " 1 15.081506 35.708630\n",
- " 2 27.422871 40.774250\n",
- " 3 -33.973209 13.470489\n",
- " 4 6.567316 -11.290100\n",
- " .. ... ...\n",
- " 395 -36.846346 -18.415783\n",
- " 396 45.741500 3.245602\n",
- " 397 49.097533 -35.725355\n",
- " 398 -13.577772 18.845139\n",
- " 399 31.927566 0.978436\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.decomposition import PCA\n",
- "pca = PCA(n_components=2)\n",
- "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n",
- "design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1552d19a",
- "metadata": {},
- "source": [
- "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
- "or `Variable` instances (`pca_var`)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "12d955dd",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( Price Price mynewvar[0] mynewvar[1]\n",
- " 0 120.0 120.0 -3.608693 -4.853177\n",
- " 1 83.0 83.0 15.081506 35.708630\n",
- " 2 80.0 80.0 27.422871 40.774250\n",
- " 3 97.0 97.0 -33.973209 13.470489\n",
- " 4 128.0 128.0 6.567316 -11.290100\n",
- " .. ... ... ... ...\n",
- " 395 128.0 128.0 -36.846346 -18.415783\n",
- " 396 120.0 120.0 45.741500 3.245602\n",
- " 397 159.0 159.0 49.097533 -35.725355\n",
- " 398 95.0 95.0 -13.577772 18.845139\n",
- " 399 120.0 120.0 31.927566 0.978436\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n",
- "design.build_columns(Carseats, fancy_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f5ea292d",
- "metadata": {},
- "source": [
- "We can of course run PCA again on these features (if we wanted)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "ae2af29b",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( fancy_pca[0] fancy_pca[1]\n",
- " 0 -6.951792 4.859283\n",
- " 1 55.170148 -24.694875\n",
- " 2 59.418556 -38.033572\n",
- " 3 34.722389 28.922184\n",
- " 4 -21.419184 -3.120673\n",
- " .. ... ...\n",
- " 395 -18.257348 40.760122\n",
- " 396 -10.546709 -45.021658\n",
- " 397 -77.706359 -37.174379\n",
- " 398 36.668694 7.730851\n",
- " 399 -9.540535 -31.059122\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['fancy_pca[0]', 'fancy_pca[1]'])"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pca2 = PCA(n_components=2)\n",
- "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n",
- "design.build_columns(Carseats, pca2_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "57305dbe",
- "metadata": {},
- "source": [
- "## Building the design matrix\n",
- "\n",
- "With these notions in mind, the final design is essentially then"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "89656ec4",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f6cb8167",
- "metadata": {},
- "source": [
- "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n",
- "a dataframe the index is adjusted accordingly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "547cb625",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.intercept"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "ff5b41d5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " Price | \n",
- " Income | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 120 | \n",
- " 73 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 83 | \n",
- " 48 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 80 | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 97 | \n",
- " 100 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " intercept Price Income\n",
- "0 1.0 120 73\n",
- "1 1.0 83 48\n",
- "2 1.0 80 35\n",
- "3 1.0 97 100"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.transform(Carseats)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "932759cf",
- "metadata": {},
- "source": [
- "## Predicting\n",
- "\n",
- "Constructing the design matrix at any values is carried out by the `transform` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "e2190b00",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n",
- "new_X = design.transform(new_data)\n",
- "M.get_prediction(new_X).predicted_mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "6545c5da",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 0 1 \n",
- "12.65258 12.25873 \n"
- ]
- }
- ],
- "source": [
- "%%R -i new_data,Carseats\n",
- "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cd088b51",
- "metadata": {},
- "source": [
- "### Difference between using `pd.DataFrame` and `np.ndarray`\n",
- "\n",
- "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
- "\n",
- "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
- "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "8f37ae20",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[1.0, 120, 73],\n",
- " [1.0, 83, 48],\n",
- " [1.0, 80, 35],\n",
- " [1.0, 97, 100]], dtype=object)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n",
- "design_np = ModelSpec([0,3]).fit(Carseats_np)\n",
- "design_np.transform(Carseats_np)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "184aefc2",
- "metadata": {},
- "source": [
- "The following will fail for hopefully obvious reasons"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "e4134980",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_D = np.zeros((2,2))\n",
- " new_D[:,0] = [10,20]\n",
- " new_D[:,1] = [40,50]\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except ValueError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "53808f3b",
- "metadata": {},
- "source": [
- "Ultimately, `M` expects 3 columns for new predictions because it was fit\n",
- "with a matrix having 3 columns (the first representing an intercept).\n",
- "\n",
- "We might be tempted to try as with the `pd.DataFrame` and produce\n",
- "an `np.ndarray` with only the necessary variables."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "62059c57",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "index 3 is out of bounds for axis 1 with size 2\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_X = np.zeros((2,2))\n",
- " new_X[:,0] = [10,20]\n",
- " new_X[:,1] = [40,50]\n",
- " new_D = design_np.transform(new_X)\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except IndexError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ded12f69",
- "metadata": {},
- "source": [
- "This fails because `design_np` is looking for column `3` from its `terms`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "fbb509d1",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design_np.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f01391e4",
- "metadata": {},
- "source": [
- "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n",
- "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "10df55ae",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_X = np.zeros((2,4))\n",
- "new_X[:,0] = [10,20]\n",
- "new_X[:,3] = [40,50]\n",
- "new_D = design_np.transform(new_X)\n",
- "M.get_prediction(new_D).predicted_mean"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b43099fb",
- "metadata": {},
- "source": [
- "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n",
- "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "50bce64d",
- "metadata": {},
- "source": [
- "## A model with some categorical variables\n",
- "\n",
- "Categorical variables become `Column` instances with encoders."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "2eb2ff16",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n",
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "6686dff8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "id": "0e0eafd7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 11.876012\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[L] -1.042297\n",
- "UIncome[M] -0.119123\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "43cce209",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "99bf408e",
- "metadata": {},
- "source": [
- "## Getting the encoding you want\n",
- "\n",
- "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n",
- "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n",
- "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n",
- "where `new_encoder` would somehow drop the column we want dropped. \n",
- "\n",
- "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "11c19ebf",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4b48e5d2",
- "metadata": {},
- "source": [
- "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n",
- "we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n",
- "and we need a way to allow different encodings of the same column of `Carseats`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "81f641ba",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "79f7eb4d",
- "metadata": {},
- "source": [
- " We can create a new \n",
- "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "2afb3b5d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import contrast\n",
- "pref_encoding = contrast('UIncome', 'drop', 'L')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "c44692ab",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[M] UIncome[H]\n",
- " 0 1.0 0.0\n",
- " 1 0.0 0.0\n",
- " 2 0.0 0.0\n",
- " 3 0.0 1.0\n",
- " 4 1.0 0.0\n",
- " .. ... ...\n",
- " 395 0.0 1.0\n",
- " 396 0.0 0.0\n",
- " 397 0.0 0.0\n",
- " 398 1.0 0.0\n",
- " 399 0.0 0.0\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['UIncome[M]', 'UIncome[H]'])"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, pref_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "c0bfb2a5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "d263056c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 10.833715\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[M] 0.923174\n",
- "UIncome[H] 1.042297\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "edf0dc68",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "82071a54",
- "metadata": {},
- "source": [
- "## Interactions\n",
- "\n",
- "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n",
- "to `ModelSpec`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "cd18a4a4",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 7.866634\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "UIncome[L] -2.734895\n",
- "UIncome[M] -2.619745\n",
- "dtype: float64"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "229fa32d",
- "metadata": {},
- "source": [
- "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n",
- "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "b8c52dbb",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n",
- " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n",
- " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e7f93464",
- "metadata": {},
- "source": [
- "Comparing this to the previous `R` model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "4094c01f",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d448c9ca",
- "metadata": {},
- "source": [
- "We note a few important things:\n",
- "\n",
- "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n",
- "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n",
- "\n",
- "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n",
- "\n",
- "A few notes:\n",
- "\n",
- "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n",
- "\n",
- "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "634e05c6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[H] UIncome[L] UIncome[M]\n",
- " 0 0.0 0.0 1.0\n",
- " 1 0.0 1.0 0.0\n",
- " 2 0.0 1.0 0.0\n",
- " 3 1.0 0.0 0.0\n",
- " 4 0.0 0.0 1.0\n",
- " .. ... ... ...\n",
- " 395 1.0 0.0 0.0\n",
- " 396 0.0 1.0 0.0\n",
- " 397 0.0 1.0 0.0\n",
- " 398 0.0 0.0 1.0\n",
- " 399 0.0 1.0 0.0\n",
- " \n",
- " [400 rows x 3 columns],\n",
- " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "full_encoding = contrast('UIncome', None)\n",
- "design.build_columns(Carseats, full_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "id": "4c09c93f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.131739\n",
- "UIncome[M] 0.115150\n",
- "UIncome[H] 1.156118\n",
- "UIncome[H]:ShelveLoc[Good] 3.738052\n",
- "UIncome[H]:ShelveLoc[Medium] 1.514104\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "dtype: float64"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "48c1989f",
- "metadata": {},
- "source": [
- "## Special encodings\n",
- "\n",
- "For flexible models, we may want to consider transformations of features, i.e. polynomial\n",
- "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n",
- "we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`\n",
- "package includes a `Poly` transform"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "id": "85a28d87",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('Income',), name='poly(Income, 3)', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import poly\n",
- "poly('Income', 3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "e17c8a9d",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.440077\n",
- "poly(Income, 3)[0] 10.036373\n",
- "poly(Income, 3)[1] -2.799156\n",
- "poly(Income, 3)[2] 2.399601\n",
- "ShelveLoc[Good] 4.808133\n",
- "ShelveLoc[Medium] 1.889533\n",
- "dtype: float64"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "944f56d6",
- "metadata": {},
- "source": [
- "Compare:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "1889caca",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n",
- " 5.440077 10.036373 -2.799156 2.399601 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.808133 1.889533 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bd4dca31",
- "metadata": {},
- "source": [
- "## Splines\n",
- "\n",
- "Support for natural and B-splines is also included"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "70fae990",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import ns, bs, pca"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2d812694",
- "metadata": {},
- "source": [
- "## Custom encoding\n",
- "\n",
- "Instead of PCA we might run some clustering on some features and then uses the clusters to\n",
- "create new features. This can be done with `derived_variable`. Indeed, `pca`, `ns` and `bs` are all examples\n",
- "of this."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "8e5d2305",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import derived_variable, Contrast"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "id": "8a40c663",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([1, 1, 2, 1, 2, 1, 0, 1, 0, 0, 0, 1, 2, 2, 0, 1, 2, 1, 0, 0, 0, 2,\n",
- " 2, 2, 1, 2, 1, 0, 0, 1, 0, 1, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2,\n",
- " 0, 2, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 2, 2, 0, 1, 2,\n",
- " 0, 1, 1, 2, 1, 1, 2, 0, 0, 1, 1, 0, 2, 0, 1, 0, 0, 2, 2, 0, 1, 2,\n",
- " 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 2, 0, 0, 1, 2, 0, 1, 0,\n",
- " 0, 1, 0, 2, 0, 2, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0,\n",
- " 0, 0, 2, 1, 0, 2, 1, 1, 1, 2, 0, 0, 2, 0, 2, 1, 0, 0, 0, 1, 2, 2,\n",
- " 1, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 1, 0, 0, 1, 1, 1, 0, 0, 2, 0,\n",
- " 1, 0, 0, 2, 1, 0, 2, 1, 2, 1, 0, 2, 2, 1, 1, 2, 2, 0, 1, 1, 2, 2,\n",
- " 1, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 2, 2, 1, 1, 0, 0, 1, 2, 2, 1, 1,\n",
- " 1, 2, 0, 2, 2, 2, 2, 0, 1, 0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 0, 0, 0,\n",
- " 2, 2, 2, 2, 1, 0, 0, 0, 1, 0, 0, 2, 1, 0, 2, 1, 2, 1, 1, 2, 1, 2,\n",
- " 2, 2, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 2,\n",
- " 1, 2, 2, 1, 1, 0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 0,\n",
- " 1, 0, 1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 0, 2, 1, 0, 1,\n",
- " 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 1,\n",
- " 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2, 1, 0, 2, 1, 1, 0, 2, 2,\n",
- " 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1, 1,\n",
- " 2, 2, 0, 2], dtype=int32)"
- ]
- },
- "execution_count": 51,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.cluster import KMeans\n",
- "from sklearn.pipeline import make_pipeline\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "cluster = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n",
- "group = Variable(('Income', 'Price', 'Advertising', 'Population'), 'group', None)\n",
- "X = design.build_submodel(Carseats, [group]).drop('intercept', axis=1)\n",
- "cluster.fit(X.values)\n",
- "cluster.predict(X.values)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9bc38836",
- "metadata": {},
- "source": [
- "For clustering, we often want to use the `predict` method rather than the `transform` method. If the ultimate\n",
- "features all use `transform` then the do not even need to use these two calls to `make_pipeline`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "8ceab9b6",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " myclus | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 395 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 396 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 397 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 398 | \n",
- " 1.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 399 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
400 rows × 2 columns
\n",
- "
"
- ],
- "text/plain": [
- " intercept myclus\n",
- "0 1.0 1\n",
- "1 1.0 1\n",
- "2 1.0 2\n",
- "3 1.0 1\n",
- "4 1.0 2\n",
- ".. ... ...\n",
- "395 1.0 1\n",
- "396 1.0 2\n",
- "397 1.0 2\n",
- "398 1.0 0\n",
- "399 1.0 2\n",
- "\n",
- "[400 rows x 2 columns]"
- ]
- },
- "execution_count": 52,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n",
- "cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], \n",
- " name='myclus', \n",
- " encoder=cluster2,\n",
- " use_transform=False)\n",
- "design = ModelSpec([cluster_var]).fit(Carseats)\n",
- "design.transform(Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1f9b2630",
- "metadata": {},
- "source": [
- "Somewhat clunkily, we can make this a categorical variable by creating a `Variable` with a\n",
- "categorical encoder."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "ffde00a5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=(Variable(variables=('Income', 'Price', 'Advertising', 'Population'), name='myclus', encoder=Pipeline(steps=[('standardscaler', StandardScaler()),\n",
- " ('kmeans', KMeans(n_clusters=3, random_state=0))]), use_transform=False, pure_columns=False, override_encoder_colnames=True),), name='mynewcat', encoder=Contrast(), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n",
- "cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], \n",
- " name='myclus', \n",
- " encoder=cluster2,\n",
- " use_transform=False)\n",
- "cat_cluster = Variable((cluster_var,), name='mynewcat', encoder=Contrast(method='drop'))\n",
- "cat_cluster"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "5afeab7c",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 395 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 396 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 397 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 398 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 399 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
400 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " intercept 1 2\n",
- "0 1.0 1.0 0.0\n",
- "1 1.0 1.0 0.0\n",
- "2 1.0 0.0 1.0\n",
- "3 1.0 1.0 0.0\n",
- "4 1.0 0.0 1.0\n",
- ".. ... ... ...\n",
- "395 1.0 1.0 0.0\n",
- "396 1.0 0.0 1.0\n",
- "397 1.0 0.0 1.0\n",
- "398 1.0 0.0 0.0\n",
- "399 1.0 0.0 1.0\n",
- "\n",
- "[400 rows x 3 columns]"
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([cat_cluster]).fit(Carseats)\n",
- "\n",
- "design.transform(Carseats)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e24d5637-80fb-49bf-ac10-8ff68cb8bd8f",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "jupytext": {
- "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
- },
- "kernelspec": {
- "display_name": "islp_test",
- "language": "python",
- "name": "islp_test"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/jupyterbook/models/derived.md b/docs/jupyterbook/models/derived.md
deleted file mode 100644
index 1d0f23b..0000000
--- a/docs/jupyterbook/models/derived.md
+++ /dev/null
@@ -1,487 +0,0 @@
----
-jupytext:
- formats: source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb
- text_representation:
- extension: .md
- format_name: myst
- format_version: 0.13
- jupytext_version: 1.14.1
-kernelspec:
- display_name: islp_test
- language: python
- name: islp_test
----
-
-# Building design matrices with `ModelSpec`
-
-Force rebuild
-
-```{code-cell} ipython3
-x=4
-import numpy as np, pandas as pd
-%load_ext rpy2.ipython
-
-from ISLP import load_data
-from ISLP.models import ModelSpec
-
-import statsmodels.api as sm
-```
-
-```{code-cell} ipython3
-Carseats = load_data('Carseats')
-%R -i Carseats
-Carseats.columns
-```
-
-## Let's break up income into groups
-
-```{code-cell} ipython3
-Carseats['OIncome'] = pd.cut(Carseats['Income'],
- [0,50,90,200],
- labels=['L','M','H'])
-Carseats['OIncome']
-```
-
-Let's also create an unordered version
-
-```{code-cell} ipython3
-Carseats['UIncome'] = pd.cut(Carseats['Income'],
- [0,50,90,200],
- labels=['L','M','H'],
- ordered=False)
-Carseats['UIncome']
-```
-
-## A simple model
-
-```{code-cell} ipython3
-design = ModelSpec(['Price', 'Income'])
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-Y = Carseats['Sales']
-M = sm.OLS(Y, X).fit()
-M.params
-```
-
-## Basic procedure
-
-The design matrix is built by cobbling together a set of columns and possibly transforming them.
-A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`
-is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:
-
-```{code-cell} ipython3
-Carseats['ShelveLoc']
-```
-
-This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods
-`get_columns` and `fit_encoder`.
-
-```{code-cell} ipython3
-design.column_info_['ShelveLoc']
-```
-
-It recognized ordinal columns as well.
-
-```{code-cell} ipython3
-design.column_info_['OIncome']
-```
-
-```{code-cell} ipython3
-income = design.column_info_['Income']
-cols, names = income.get_columns(Carseats)
-(cols[:4], names)
-```
-
-## Encoding a column
-
-In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical
-variables usually are encoded by several columns, typically one less than the number of categories.
-This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform
-model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits
-its encoder the first time data is passed to it.
-
-```{code-cell} ipython3
-shelve = design.column_info_['ShelveLoc']
-cols, names = shelve.get_columns(Carseats)
-(cols[:4], names)
-```
-
-```{code-cell} ipython3
-oincome = design.column_info_['OIncome']
-oincome.get_columns(Carseats)[0][:4]
-```
-
-## The terms
-
-The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through
-the `terms` argument which should be a sequence. The elements of `terms` are often
-going to be strings (or tuples of strings for interactions, see below) but are converted to a
-`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple.
-
-```{code-cell} ipython3
-design.terms
-```
-
-```{code-cell} ipython3
-design.terms_
-```
-
-While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A
-`Variable` can also create columns through the `build_columns` method of `ModelSpec`
-
-```{code-cell} ipython3
-price = design.terms_[0]
-design.build_columns(Carseats, price)
-```
-
-Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The
-tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then
-is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during
-the call to `ModelSpec.fit`.
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import Variable
-
-new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)
-design.build_columns(Carseats, new_var)
-```
-
-Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the
-arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`.
-
-```{code-cell} ipython3
-from sklearn.decomposition import PCA
-pca = PCA(n_components=2)
-pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`
-pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)
-design.build_columns(Carseats, pca_var)
-```
-
-The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`)
-or `Variable` instances (`pca_var`).
-
-```{code-cell} ipython3
-fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)
-design.build_columns(Carseats, fancy_var)
-```
-
-We can of course run PCA again on these features (if we wanted).
-
-```{code-cell} ipython3
-pca2 = PCA(n_components=2)
-pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`
-pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)
-design.build_columns(Carseats, pca2_var)
-```
-
-## Building the design matrix
-
-With these notions in mind, the final design is essentially then
-
-```{code-cell} ipython3
-X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]
-```
-
-An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is
-a dataframe the index is adjusted accordingly.
-
-```{code-cell} ipython3
-design.intercept
-```
-
-```{code-cell} ipython3
-design.transform(Carseats)[:4]
-```
-
-## Predicting
-
-Constructing the design matrix at any values is carried out by the `transform` method.
-
-```{code-cell} ipython3
-new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})
-new_X = design.transform(new_data)
-M.get_prediction(new_X).predicted_mean
-```
-
-```{code-cell} ipython3
-%%R -i new_data,Carseats
-predict(lm(Sales ~ Price + Income, data=Carseats), new_data)
-```
-
-### Difference between using `pd.DataFrame` and `np.ndarray`
-
-If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.
-
-If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,
-in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning.
-
-```{code-cell} ipython3
-Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])
-design_np = ModelSpec([0,3]).fit(Carseats_np)
-design_np.transform(Carseats_np)[:4]
-```
-
-The following will fail for hopefully obvious reasons
-
-```{code-cell} ipython3
-try:
- new_D = np.zeros((2,2))
- new_D[:,0] = [10,20]
- new_D[:,1] = [40,50]
- M.get_prediction(new_D).predicted_mean
-except ValueError as e:
- print(e)
-```
-
-Ultimately, `M` expects 3 columns for new predictions because it was fit
-with a matrix having 3 columns (the first representing an intercept).
-
-We might be tempted to try as with the `pd.DataFrame` and produce
-an `np.ndarray` with only the necessary variables.
-
-```{code-cell} ipython3
-try:
- new_X = np.zeros((2,2))
- new_X[:,0] = [10,20]
- new_X[:,1] = [40,50]
- new_D = design_np.transform(new_X)
- M.get_prediction(new_D).predicted_mean
-except IndexError as e:
- print(e)
-```
-
-This fails because `design_np` is looking for column `3` from its `terms`:
-
-```{code-cell} ipython3
-design_np.terms_
-```
-
-However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed
-represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:
-
-```{code-cell} ipython3
-new_X = np.zeros((2,4))
-new_X[:,0] = [10,20]
-new_X[:,3] = [40,50]
-new_D = design_np.transform(new_X)
-M.get_prediction(new_D).predicted_mean
-```
-
-Given this subtlety about needing to supply arrays with identical column structure to `transform` when
-using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case.
-
-+++
-
-## A model with some categorical variables
-
-Categorical variables become `Column` instances with encoders.
-
-```{code-cell} ipython3
-design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)
-design.column_info_['UIncome']
-```
-
-```{code-cell} ipython3
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef
-```
-
-## Getting the encoding you want
-
-By default the level dropped by `ModelSpec` will be the first of the `categories_` values from
-`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems
-as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`
-where `new_encoder` would somehow drop the column we want dropped.
-
-However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:
-
-```{code-cell} ipython3
-design.column_info_['UIncome']
-```
-
-This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when
-we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest
-and we need a way to allow different encodings of the same column of `Carseats`
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)
-```
-
- We can create a new
-`Column` with the encoder we want. For categorical variables, there is a convenience function to do so.
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import contrast
-pref_encoding = contrast('UIncome', 'drop', 'L')
-```
-
-```{code-cell} ipython3
-design.build_columns(Carseats, pref_encoding)
-```
-
-```{code-cell} ipython3
-design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef
-```
-
-## Interactions
-
-We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument
-to `ModelSpec`.
-
-```{code-cell} ipython3
-design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with
-`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`.
-
-```{code-cell} ipython3
-design.terms_[0]
-```
-
-Comparing this to the previous `R` model.
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)
-```
-
-We note a few important things:
-
-1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these
-columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**
-
-2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**
-
-A few notes:
-
-- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**
-
-- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily.
-
-```{code-cell} ipython3
-full_encoding = contrast('UIncome', None)
-design.build_columns(Carseats, full_encoding)
-```
-
-```{code-cell} ipython3
-design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-## Special encodings
-
-For flexible models, we may want to consider transformations of features, i.e. polynomial
-or spline transformations. Given transforms that follow the `fit/transform` paradigm
-we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`
-package includes a `Poly` transform
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import poly
-poly('Income', 3)
-```
-
-```{code-cell} ipython3
-design = ModelSpec([poly('Income', 3), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-Compare:
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef
-```
-
-## Splines
-
-Support for natural and B-splines is also included
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import ns, bs, pca
-```
-
-## Custom encoding
-
-Instead of PCA we might run some clustering on some features and then uses the clusters to
-create new features. This can be done with `derived_variable`. Indeed, `pca`, `ns` and `bs` are all examples
-of this.
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import derived_variable, Contrast
-```
-
-```{code-cell} ipython3
-from sklearn.cluster import KMeans
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
-cluster = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))
-group = Variable(('Income', 'Price', 'Advertising', 'Population'), 'group', None)
-X = design.build_submodel(Carseats, [group]).drop('intercept', axis=1)
-cluster.fit(X.values)
-cluster.predict(X.values)
-```
-
-For clustering, we often want to use the `predict` method rather than the `transform` method. If the ultimate
-features all use `transform` then the do not even need to use these two calls to `make_pipeline`.
-
-```{code-cell} ipython3
-cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))
-cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'],
- name='myclus',
- encoder=cluster2,
- use_transform=False)
-design = ModelSpec([cluster_var]).fit(Carseats)
-design.transform(Carseats)
-```
-
-Somewhat clunkily, we can make this a categorical variable by creating a `Variable` with a
-categorical encoder.
-
-```{code-cell} ipython3
-cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))
-cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'],
- name='myclus',
- encoder=cluster2,
- use_transform=False)
-cat_cluster = Variable((cluster_var,), name='mynewcat', encoder=Contrast(method='drop'))
-cat_cluster
-```
-
-```{code-cell} ipython3
-design = ModelSpec([cat_cluster]).fit(Carseats)
-
-design.transform(Carseats)
-```
-
-```{code-cell} ipython3
-
-```
diff --git a/docs/jupyterbook/models/selection.ipynb b/docs/jupyterbook/models/selection.ipynb
index b41cf6a..fd66d95 100644
--- a/docs/jupyterbook/models/selection.ipynb
+++ b/docs/jupyterbook/models/selection.ipynb
@@ -2,2723 +2,259 @@
"cells": [
{
"cell_type": "markdown",
- "id": "72bae06a",
+ "id": "247387ec-1477-42e6-9e69-cad1cacb5721",
"metadata": {},
"source": [
- "# Model selection using `ModelSpec`"
+ "# Model selection using `ModelSpec`\n",
+ "\n",
+ "\n",
+ "In this lab we illustrate how to run forward stepwise model selection\n",
+ "using the model specification capability of `ModelSpec`."
]
},
{
"cell_type": "code",
"execution_count": 1,
- "id": "ae6bd850",
+ "id": "4720bb2a-6bec-4e91-a57e-9689aa4f0532",
"metadata": {},
"outputs": [],
"source": [
- "import numpy as np, pandas as pd\n",
- "%load_ext rpy2.ipython\n",
- "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from statsmodels.api import OLS\n",
"from ISLP import load_data\n",
- "from ISLP.models import ModelSpec\n",
- "\n",
- "import statsmodels.api as sm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "5ac10e72",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n",
- " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats = load_data('Carseats')\n",
- "%R -i Carseats\n",
- "Carseats.columns"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "80a586d9",
- "metadata": {},
- "source": [
- "## Let's break up income into groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "850356ba",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: OIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L' < 'M' < 'H']"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'])\n",
- "Carseats['OIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e24def3a",
- "metadata": {},
- "source": [
- "Let's also create an unordered version"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "edf83080",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: UIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L', 'M', 'H']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'],\n",
- " ordered=False)\n",
- "Carseats['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "aa22bb9c",
- "metadata": {},
- "source": [
- "## A simple model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "38d92522",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Price', 'Income'], dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Price', 'Income'])\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "cfc2056f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 12.661546\n",
- "Price -0.052213\n",
- "Income 0.012829\n",
- "dtype: float64"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Y = Carseats['Sales']\n",
- "M = sm.OLS(Y, X).fit()\n",
- "M.params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4674c345",
- "metadata": {},
- "source": [
- "## Basic procedure\n",
- "\n",
- "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n",
- "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n",
- "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "5688f0ad",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 Bad\n",
- "1 Good\n",
- "2 Medium\n",
- "3 Medium\n",
- "4 Bad\n",
- " ... \n",
- "395 Good\n",
- "396 Medium\n",
- "397 Medium\n",
- "398 Bad\n",
- "399 Good\n",
- "Name: ShelveLoc, Length: 400, dtype: category\n",
- "Categories (3, object): ['Bad', 'Good', 'Medium']"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['ShelveLoc']"
+ "from ISLP.models import (ModelSpec,\n",
+ " Stepwise,\n",
+ " sklearn_selected)"
]
},
{
"cell_type": "markdown",
- "id": "4ae28ffa",
+ "id": "1c224240-ce8b-47f3-a85a-052c43038b26",
"metadata": {},
"source": [
- "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n",
- "`get_columns` and `fit_encoder`."
+ "### Forward Selection\n",
+ " \n",
+ "We will apply the forward-selection approach to the `Hitters` \n",
+ "data. We wish to predict a baseball player’s `Salary` on the\n",
+ "basis of various statistics associated with performance in the\n",
+ "previous year."
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "5f8926fd",
+ "execution_count": 2,
+ "id": "2adc66cc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())"
+ "59"
]
},
- "execution_count": 8,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "design.column_info_['ShelveLoc']"
+ "Hitters = load_data('Hitters')\n",
+ "np.isnan(Hitters['Salary']).sum()"
]
},
{
"cell_type": "markdown",
- "id": "966f53a5",
- "metadata": {},
- "source": [
- "It recognized ordinal columns as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "a137fa1e",
+ "id": "40c9a484",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "design.column_info_['OIncome']"
+ " \n",
+ " We see that `Salary` is missing for 59 players. The\n",
+ "`dropna()` method of data frames removes all of the rows that have missing\n",
+ "values in any variable (by default --- see `Hitters.dropna?`)."
]
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "3390dcb0",
+ "execution_count": 3,
+ "id": "1869fdab",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(array([ 73, 48, 35, 100]), ('Income',))"
+ "(263, 20)"
]
},
- "execution_count": 10,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "income = design.column_info_['Income']\n",
- "cols, names = income.get_columns(Carseats)\n",
- "(cols[:4], names)"
+ "Hitters = Hitters.dropna()\n",
+ "Hitters.shape"
]
},
{
"cell_type": "markdown",
- "id": "b6667415",
- "metadata": {},
- "source": [
- "## Encoding a column\n",
- "\n",
- "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n",
- "variables usually are encoded by several columns, typically one less than the number of categories.\n",
- "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n",
- "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n",
- "its encoder the first time data is passed to it."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "a1b42dbd",
+ "id": "0a1fe9e6",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([[0., 0.],\n",
- " [1., 0.],\n",
- " [0., 1.],\n",
- " [0., 1.]]),\n",
- " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "shelve = design.column_info_['ShelveLoc']\n",
- "cols, names = shelve.get_columns(Carseats)\n",
- "(cols[:4], names)"
+ "We first choose the best model using forward selection based on AIC. This score\n",
+ "is not built in as a metric to `sklearn`. We therefore define a function to compute it ourselves, and use\n",
+ "it as a scorer. By default, `sklearn` tries to maximize a score, hence\n",
+ " our scoring function computes the negative AIC statistic."
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "31367988",
+ "execution_count": 4,
+ "id": "76bd8110",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[2.],\n",
- " [1.],\n",
- " [1.],\n",
- " [0.]])"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "oincome = design.column_info_['OIncome']\n",
- "oincome.get_columns(Carseats)[0][:4]"
+ "def negAIC(estimator, X, Y):\n",
+ " \"Negative AIC\"\n",
+ " n, p = X.shape\n",
+ " Yhat = estimator.predict(X)\n",
+ " MSE = np.mean((Y - Yhat)**2)\n",
+ " return n + n * np.log(MSE) + 2 * (p + 1)\n",
+ " "
]
},
{
"cell_type": "markdown",
- "id": "751c1487",
- "metadata": {},
- "source": [
- "## The terms\n",
- "\n",
- "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n",
- "the `terms` argument which should be a sequence. The elements of `terms` are often\n",
- "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n",
- "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "6e2b6155",
+ "id": "14ba6f49",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Price', 'Income']"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "design.terms"
+ "We need to estimate the residual variance $\\sigma^2$, which is the first argument in our scoring function above.\n",
+ "We will fit the biggest model, using all the variables, and estimate $\\sigma^2$ based on its MSE."
]
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "d3e669da",
+ "execution_count": 5,
+ "id": "94e10f35",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "design.terms_"
+ "design = ModelSpec(Hitters.columns.drop('Salary')).fit(Hitters)\n",
+ "Y = np.array(Hitters['Salary'])\n",
+ "X = design.transform(Hitters)"
]
},
{
"cell_type": "markdown",
- "id": "fb0a45c9",
+ "id": "afdda5f2",
"metadata": {},
"source": [
- "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n",
- "`Variable` can also create columns through the `build_columns` method of `ModelSpec`"
+ "Along with a score we need to specify the search strategy. This is done through the object\n",
+ "`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()`\n",
+ "runs forward stepwise until any further additions to the model do not result\n",
+ "in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`\n",
+ "runs a fixed number of steps of stepwise search."
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "554c67cb",
+ "execution_count": 6,
+ "id": "048c8500",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price\n",
- " 0 120\n",
- " 1 83\n",
- " 2 80\n",
- " 3 97\n",
- " 4 128\n",
- " .. ...\n",
- " 395 128\n",
- " 396 120\n",
- " 397 159\n",
- " 398 95\n",
- " 399 120\n",
- " \n",
- " [400 rows x 1 columns],\n",
- " ['Price'])"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "price = design.terms_[0]\n",
- "design.build_columns(Carseats, price)"
+ "strategy = Stepwise.first_peak(design,\n",
+ " direction='forward',\n",
+ " max_terms=len(design.terms))"
]
},
{
"cell_type": "markdown",
- "id": "06956a6f",
+ "id": "e0c0af0e",
"metadata": {},
"source": [
- "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
- "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
- "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n",
- "the call to `ModelSpec.fit`."
+ " \n",
+ "We now fit a linear regression model with `Salary` as outcome using forward\n",
+ "selection. To do so, we use the function `sklearn_selected()` from the `ISLP.models` package. This takes\n",
+ "a model from `statsmodels` along with a search strategy and selects a model with its\n",
+ "`fit` method. Without specifying a `scoring` argument, the score defaults to MSE, and so all 19 variables will be\n",
+ "selected."
]
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "dd434884",
+ "execution_count": 7,
+ "id": "26f09fe9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "( Price Income UIncome[L] UIncome[M]\n",
- " 0 120.0 73.0 0.0 1.0\n",
- " 1 83.0 48.0 1.0 0.0\n",
- " 2 80.0 35.0 1.0 0.0\n",
- " 3 97.0 100.0 0.0 0.0\n",
- " 4 128.0 64.0 0.0 1.0\n",
- " .. ... ... ... ...\n",
- " 395 128.0 108.0 0.0 0.0\n",
- " 396 120.0 23.0 1.0 0.0\n",
- " 397 159.0 26.0 1.0 0.0\n",
- " 398 95.0 79.0 0.0 1.0\n",
- " 399 120.0 37.0 1.0 0.0\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])"
+ "('Assists',\n",
+ " 'AtBat',\n",
+ " 'CAtBat',\n",
+ " 'CHits',\n",
+ " 'CHmRun',\n",
+ " 'CRBI',\n",
+ " 'CRuns',\n",
+ " 'CWalks',\n",
+ " 'Division',\n",
+ " 'Errors',\n",
+ " 'Hits',\n",
+ " 'HmRun',\n",
+ " 'League',\n",
+ " 'NewLeague',\n",
+ " 'PutOuts',\n",
+ " 'RBI',\n",
+ " 'Runs',\n",
+ " 'Walks',\n",
+ " 'Years')"
]
},
- "execution_count": 16,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "from ISLP.models.model_spec import Variable\n",
- "\n",
- "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n",
- "design.build_columns(Carseats, new_var)"
+ "hitters_MSE = sklearn_selected(OLS,\n",
+ " strategy)\n",
+ "hitters_MSE.fit(Hitters, Y)\n",
+ "hitters_MSE.selected_state_"
]
},
{
"cell_type": "markdown",
- "id": "5cdb088c",
+ "id": "4acf4792",
"metadata": {},
"source": [
- "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n",
- "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`."
+ " Using `neg_Cp` results in a smaller model, as expected, with just 4variables selected."
]
},
{
"cell_type": "code",
- "execution_count": 17,
- "id": "519a642e",
+ "execution_count": 8,
+ "id": "a825f4d8",
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
{
"data": {
"text/plain": [
- "( mynewvar[0] mynewvar[1]\n",
- " 0 -3.608693 -4.853177\n",
- " 1 15.081506 35.708630\n",
- " 2 27.422871 40.774250\n",
- " 3 -33.973209 13.470489\n",
- " 4 6.567316 -11.290100\n",
- " .. ... ...\n",
- " 395 -36.846346 -18.415783\n",
- " 396 45.741500 3.245602\n",
- " 397 49.097533 -35.725355\n",
- " 398 -13.577772 18.845139\n",
- " 399 31.927566 0.978436\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['mynewvar[0]', 'mynewvar[1]'])"
+ "('Assists', 'Errors', 'League', 'NewLeague')"
]
},
- "execution_count": 17,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "from sklearn.decomposition import PCA\n",
- "pca = PCA(n_components=2)\n",
- "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n",
- "design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "403921a2",
- "metadata": {},
- "source": [
- "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
- "or `Variable` instances (`pca_var`)."
+ "hitters_Cp = sklearn_selected(OLS,\n",
+ " strategy,\n",
+ " scoring=negAIC)\n",
+ "hitters_Cp.fit(Hitters, Y)\n",
+ "hitters_Cp.selected_state_"
]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "b422cde1",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( Price Price mynewvar[0] mynewvar[1]\n",
- " 0 120.0 120.0 -3.608693 -4.853177\n",
- " 1 83.0 83.0 15.081506 35.708630\n",
- " 2 80.0 80.0 27.422871 40.774250\n",
- " 3 97.0 97.0 -33.973209 13.470489\n",
- " 4 128.0 128.0 6.567316 -11.290100\n",
- " .. ... ... ... ...\n",
- " 395 128.0 128.0 -36.846346 -18.415783\n",
- " 396 120.0 120.0 45.741500 3.245602\n",
- " 397 159.0 159.0 49.097533 -35.725355\n",
- " 398 95.0 95.0 -13.577772 18.845139\n",
- " 399 120.0 120.0 31.927566 0.978436\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n",
- "design.build_columns(Carseats, fancy_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "53e38f57",
- "metadata": {},
- "source": [
- "We can of course run PCA again on these features (if we wanted)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "6347acb6",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( fancy_pca[0] fancy_pca[1]\n",
- " 0 -6.951792 4.859283\n",
- " 1 55.170148 -24.694875\n",
- " 2 59.418556 -38.033572\n",
- " 3 34.722389 28.922184\n",
- " 4 -21.419184 -3.120673\n",
- " .. ... ...\n",
- " 395 -18.257348 40.760122\n",
- " 396 -10.546709 -45.021658\n",
- " 397 -77.706359 -37.174379\n",
- " 398 36.668694 7.730851\n",
- " 399 -9.540535 -31.059122\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['fancy_pca[0]', 'fancy_pca[1]'])"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pca2 = PCA(n_components=2)\n",
- "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n",
- "design.build_columns(Carseats, pca2_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "08b5ddb0",
- "metadata": {},
- "source": [
- "## Building the design matrix\n",
- "\n",
- "With these notions in mind, the final design is essentially then"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "a8eb3e33",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "97912337",
- "metadata": {},
- "source": [
- "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n",
- "a dataframe the index is adjusted accordingly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "72b5e629",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.intercept"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "8a457e3e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " Price | \n",
- " Income | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 120 | \n",
- " 73 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 83 | \n",
- " 48 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 80 | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 97 | \n",
- " 100 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " intercept Price Income\n",
- "0 1.0 120 73\n",
- "1 1.0 83 48\n",
- "2 1.0 80 35\n",
- "3 1.0 97 100"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.transform(Carseats)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8624ab8c",
- "metadata": {},
- "source": [
- "## Predicting\n",
- "\n",
- "Constructing the design matrix at any values is carried out by the `transform` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "6052765e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n",
- "new_X = design.transform(new_data)\n",
- "M.get_prediction(new_X).predicted_mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "9158de59",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 0 1 \n",
- "12.65258 12.25873 \n"
- ]
- }
- ],
- "source": [
- "%%R -i new_data,Carseats\n",
- "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9608bed3",
- "metadata": {},
- "source": [
- "### Difference between using `pd.DataFrame` and `np.ndarray`\n",
- "\n",
- "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
- "\n",
- "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
- "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "f0b8120f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[1.0, 120, 73],\n",
- " [1.0, 83, 48],\n",
- " [1.0, 80, 35],\n",
- " [1.0, 97, 100]], dtype=object)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n",
- "design_np = ModelSpec([0,3]).fit(Carseats_np)\n",
- "design_np.transform(Carseats_np)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "270a02a6",
- "metadata": {},
- "source": [
- "The following will fail for hopefully obvious reasons"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "4ffbce7e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_D = np.zeros((2,2))\n",
- " new_D[:,0] = [10,20]\n",
- " new_D[:,1] = [40,50]\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except ValueError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bc5ff62b",
- "metadata": {},
- "source": [
- "Ultimately, `M` expects 3 columns for new predictions because it was fit\n",
- "with a matrix having 3 columns (the first representing an intercept).\n",
- "\n",
- "We might be tempted to try as with the `pd.DataFrame` and produce\n",
- "an `np.ndarray` with only the necessary variables."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "34dae1e9",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "index 3 is out of bounds for axis 1 with size 2\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_X = np.zeros((2,2))\n",
- " new_X[:,0] = [10,20]\n",
- " new_X[:,1] = [40,50]\n",
- " new_D = design_np.transform(new_X)\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except IndexError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7e9da262",
- "metadata": {},
- "source": [
- "This fails because `design_np` is looking for column `3` from its `terms`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "938b9430",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design_np.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "083e9529",
- "metadata": {},
- "source": [
- "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n",
- "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "d413a9fe",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_X = np.zeros((2,4))\n",
- "new_X[:,0] = [10,20]\n",
- "new_X[:,3] = [40,50]\n",
- "new_D = design_np.transform(new_X)\n",
- "M.get_prediction(new_D).predicted_mean"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0f4b508b",
- "metadata": {},
- "source": [
- "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n",
- "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8bcbd973",
- "metadata": {},
- "source": [
- "## A model with some categorical variables\n",
- "\n",
- "Categorical variables become `Column` instances with encoders."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "cf13f72e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n",
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "c1fa0a90",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "id": "b28aa313",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 11.876012\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[L] -1.042297\n",
- "UIncome[M] -0.119123\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "aa764acc",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "31876a29",
- "metadata": {},
- "source": [
- "## Getting the encoding you want\n",
- "\n",
- "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n",
- "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n",
- "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n",
- "where `new_encoder` would somehow drop the column we want dropped. \n",
- "\n",
- "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "bac2643c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1485735d",
- "metadata": {},
- "source": [
- "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n",
- "we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n",
- "and we need a way to allow different encodings of the same column of `Carseats`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "3987c5d6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7a6631c9",
- "metadata": {},
- "source": [
- " We can create a new \n",
- "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "83a9b94e",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import contrast\n",
- "pref_encoding = contrast('UIncome', 'drop', 'L')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "f0ffabea",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[M] UIncome[H]\n",
- " 0 1.0 0.0\n",
- " 1 0.0 0.0\n",
- " 2 0.0 0.0\n",
- " 3 0.0 1.0\n",
- " 4 1.0 0.0\n",
- " .. ... ...\n",
- " 395 0.0 1.0\n",
- " 396 0.0 0.0\n",
- " 397 0.0 0.0\n",
- " 398 1.0 0.0\n",
- " 399 0.0 0.0\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['UIncome[M]', 'UIncome[H]'])"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, pref_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "4a5fdc64",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "ae7e3bd2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 10.833715\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[M] 0.923174\n",
- "UIncome[H] 1.042297\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "c12ac3df",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "53bf8aef",
- "metadata": {},
- "source": [
- "## Interactions\n",
- "\n",
- "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n",
- "to `ModelSpec`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "47723bce",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 7.866634\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "UIncome[L] -2.734895\n",
- "UIncome[M] -2.619745\n",
- "dtype: float64"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "86060622",
- "metadata": {},
- "source": [
- "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n",
- "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "d7a2ab9b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n",
- " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n",
- " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2a5e7f6b",
- "metadata": {},
- "source": [
- "Comparing this to the previous `R` model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "bbb02036",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "89106a85",
- "metadata": {},
- "source": [
- "We note a few important things:\n",
- "\n",
- "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n",
- "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n",
- "\n",
- "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n",
- "\n",
- "A few notes:\n",
- "\n",
- "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n",
- "\n",
- "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "151f3fee",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[H] UIncome[L] UIncome[M]\n",
- " 0 0.0 0.0 1.0\n",
- " 1 0.0 1.0 0.0\n",
- " 2 0.0 1.0 0.0\n",
- " 3 1.0 0.0 0.0\n",
- " 4 0.0 0.0 1.0\n",
- " .. ... ... ...\n",
- " 395 1.0 0.0 0.0\n",
- " 396 0.0 1.0 0.0\n",
- " 397 0.0 1.0 0.0\n",
- " 398 0.0 0.0 1.0\n",
- " 399 0.0 1.0 0.0\n",
- " \n",
- " [400 rows x 3 columns],\n",
- " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "full_encoding = contrast('UIncome', None)\n",
- "design.build_columns(Carseats, full_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "id": "945ce7bc",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.131739\n",
- "UIncome[M] 0.115150\n",
- "UIncome[H] 1.156118\n",
- "UIncome[H]:ShelveLoc[Good] 3.738052\n",
- "UIncome[H]:ShelveLoc[Medium] 1.514104\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "dtype: float64"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "450b94dd",
- "metadata": {},
- "source": [
- "## Special encodings\n",
- "\n",
- "For flexible models, we may want to consider transformations of features, i.e. polynomial\n",
- "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n",
- "we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`\n",
- "package includes a `Poly` transform"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "id": "18d5c1c8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('Income',), name='poly(Income, 3, )', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import poly\n",
- "poly('Income', 3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "46c7d911",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.440077\n",
- "poly(Income, 3, )[0] 10.036373\n",
- "poly(Income, 3, )[1] -2.799156\n",
- "poly(Income, 3, )[2] 2.399601\n",
- "ShelveLoc[Good] 4.808133\n",
- "ShelveLoc[Medium] 1.889533\n",
- "dtype: float64"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "99bf13a1",
- "metadata": {},
- "source": [
- "Compare:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "7606facd",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n",
- " 5.440077 10.036373 -2.799156 2.399601 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.808133 1.889533 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a4931031",
- "metadata": {},
- "source": [
- "## Splines\n",
- "\n",
- "Support for natural and B-splines is also included"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "1c1bf5f3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 4.240421\n",
- "ns(Income, , df=5)[0] 1.468196\n",
- "ns(Income, , df=5)[1] 1.499471\n",
- "ns(Income, , df=5)[2] 1.152070\n",
- "ns(Income, , df=5)[3] 2.418398\n",
- "ns(Income, , df=5)[4] 1.804460\n",
- "ShelveLoc[Good] 4.810449\n",
- "ShelveLoc[Medium] 1.881095\n",
- "dtype: float64"
- ]
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import ns, bs, pca\n",
- "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "8c24254b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n",
- " 4.240421 1.468196 1.499471 1.152070 \n",
- "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n",
- " 2.418398 1.804460 4.810449 1.881095 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "library(splines)\n",
- "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "id": "f9d6c4a7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 3.495085\n",
- "bs(Income, , df=7, degree=2)[0] 1.813118\n",
- "bs(Income, , df=7, degree=2)[1] 0.961852\n",
- "bs(Income, , df=7, degree=2)[2] 2.471545\n",
- "bs(Income, , df=7, degree=2)[3] 2.158891\n",
- "bs(Income, , df=7, degree=2)[4] 2.091625\n",
- "bs(Income, , df=7, degree=2)[5] 2.600669\n",
- "bs(Income, , df=7, degree=2)[6] 2.843108\n",
- "ShelveLoc[Good] 4.804919\n",
- "ShelveLoc[Medium] 1.880337\n",
- "dtype: float64"
- ]
- },
- "execution_count": 51,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "0bf1726a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) bs(Income, df = 7, degree = 2)1 \n",
- " 3.4950851 1.8131176 \n",
- "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n",
- " 0.9618523 2.4715450 \n",
- "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n",
- " 2.1588908 2.0916252 \n",
- "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n",
- " 2.6006694 2.8431084 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.8049190 1.8803375 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "914df4cf",
- "metadata": {},
- "source": [
- "## PCA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "cc22e780",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.419405\n",
- "pca(myvars, , n_components=2)[0] -0.001131\n",
- "pca(myvars, , n_components=2)[1] -0.024217\n",
- "ShelveLoc[Good] 4.816253\n",
- "ShelveLoc[Medium] 1.924139\n",
- "dtype: float64"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars'), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "de571e61",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.419405 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n",
- " 0.001131 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n",
- " -0.024217 \n",
- " ShelveLocGood \n",
- " 4.816253 \n",
- " ShelveLocMedium \n",
- " 1.924139 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0a103b5a",
- "metadata": {},
- "source": [
- "It is of course common to scale before running PCA."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "id": "95ca42f5",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.352159\n",
- "pca(myvars, , n_components=2)[0] 0.446383\n",
- "pca(myvars, , n_components=2)[1] -1.219788\n",
- "ShelveLoc[Good] 4.922780\n",
- "ShelveLoc[Medium] 2.005617\n",
- "dtype: float64"
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars',\n",
- " scale=True), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "id": "0dc22e35",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.3522 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n",
- " 0.4469 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n",
- " -1.2213 \n",
- " ShelveLocGood \n",
- " 4.9228 \n",
- " ShelveLocMedium \n",
- " 2.0056 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "70347ee9",
- "metadata": {},
- "source": [
- "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n",
- "of `np.std(ddof=1)`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "id": "aa0c2f2e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 0.44694166, -1.22131519])"
- ]
- },
- "execution_count": 57,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ab05c497",
- "metadata": {},
- "source": [
- "## Model selection\n",
- "\n",
- "Another task requiring different design matrices is model selection. Manipulating\n",
- "the `terms` attribute of a `ModelSpec` (or more precisely its more uniform version `terms_`)\n",
- "can clearly allow for both exhaustive and stepwise model selection."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "id": "9505c178",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.strategy import (Stepwise, \n",
- " min_max)\n",
- "from ISLP.models.generic_selector import FeatureSelector"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "020c2532",
- "metadata": {},
- "source": [
- "### Best subsets"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "id": "f9aba6db",
- "metadata": {},
- "outputs": [],
- "source": [
- "design = ModelSpec(['Price', \n",
- " 'UIncome', \n",
- " 'Advertising', \n",
- " 'US', \n",
- " 'Income',\n",
- " 'ShelveLoc',\n",
- " 'Education',\n",
- " 'Urban']).fit(Carseats)\n",
- "strategy = min_max(design,\n",
- " min_terms=0,\n",
- " max_terms=3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "id": "91144a3d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.linear_model import LinearRegression\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "id": "ae3cb2eb",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 61,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.fit(Carseats, Y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "id": "e63b2744",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Price', 'Advertising', 'ShelveLoc')"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "id": "0a774b48",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([(), ('Price',), ('UIncome',), ('Advertising',), ('US',), ('Income',), ('ShelveLoc',), ('Education',), ('Urban',), ('Price', 'UIncome'), ('Price', 'Advertising'), ('Price', 'US'), ('Price', 'Income'), ('Price', 'ShelveLoc'), ('Price', 'Education'), ('Price', 'Urban'), ('UIncome', 'Advertising'), ('UIncome', 'US'), ('UIncome', 'Income'), ('UIncome', 'ShelveLoc'), ('UIncome', 'Education'), ('UIncome', 'Urban'), ('Advertising', 'US'), ('Advertising', 'Income'), ('Advertising', 'ShelveLoc'), ('Advertising', 'Education'), ('Advertising', 'Urban'), ('US', 'Income'), ('US', 'ShelveLoc'), ('US', 'Education'), ('US', 'Urban'), ('Income', 'ShelveLoc'), ('Income', 'Education'), ('Income', 'Urban'), ('ShelveLoc', 'Education'), ('ShelveLoc', 'Urban'), ('Education', 'Urban'), ('Price', 'UIncome', 'Advertising'), ('Price', 'UIncome', 'US'), ('Price', 'UIncome', 'Income'), ('Price', 'UIncome', 'ShelveLoc'), ('Price', 'UIncome', 'Education'), ('Price', 'UIncome', 'Urban'), ('Price', 'Advertising', 'US'), ('Price', 'Advertising', 'Income'), ('Price', 'Advertising', 'ShelveLoc'), ('Price', 'Advertising', 'Education'), ('Price', 'Advertising', 'Urban'), ('Price', 'US', 'Income'), ('Price', 'US', 'ShelveLoc'), ('Price', 'US', 'Education'), ('Price', 'US', 'Urban'), ('Price', 'Income', 'ShelveLoc'), ('Price', 'Income', 'Education'), ('Price', 'Income', 'Urban'), ('Price', 'ShelveLoc', 'Education'), ('Price', 'ShelveLoc', 'Urban'), ('Price', 'Education', 'Urban'), ('UIncome', 'Advertising', 'US'), ('UIncome', 'Advertising', 'Income'), ('UIncome', 'Advertising', 'ShelveLoc'), ('UIncome', 'Advertising', 'Education'), ('UIncome', 'Advertising', 'Urban'), ('UIncome', 'US', 'Income'), ('UIncome', 'US', 'ShelveLoc'), ('UIncome', 'US', 'Education'), ('UIncome', 'US', 'Urban'), ('UIncome', 'Income', 'ShelveLoc'), ('UIncome', 'Income', 'Education'), ('UIncome', 'Income', 'Urban'), ('UIncome', 'ShelveLoc', 'Education'), ('UIncome', 'ShelveLoc', 'Urban'), ('UIncome', 'Education', 
'Urban'), ('Advertising', 'US', 'Income'), ('Advertising', 'US', 'ShelveLoc'), ('Advertising', 'US', 'Education'), ('Advertising', 'US', 'Urban'), ('Advertising', 'Income', 'ShelveLoc'), ('Advertising', 'Income', 'Education'), ('Advertising', 'Income', 'Urban'), ('Advertising', 'ShelveLoc', 'Education'), ('Advertising', 'ShelveLoc', 'Urban'), ('Advertising', 'Education', 'Urban'), ('US', 'Income', 'ShelveLoc'), ('US', 'Income', 'Education'), ('US', 'Income', 'Urban'), ('US', 'ShelveLoc', 'Education'), ('US', 'ShelveLoc', 'Urban'), ('US', 'Education', 'Urban'), ('Income', 'ShelveLoc', 'Education'), ('Income', 'ShelveLoc', 'Urban'), ('Income', 'Education', 'Urban'), ('ShelveLoc', 'Education', 'Urban')])"
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "id": "0ca1f28c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Price', 'Advertising', 'Income')"
- ]
- },
- "execution_count": 64,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "strategy = min_max(design,\n",
- " min_terms=0,\n",
- " max_terms=3,\n",
- " lower_terms=['Price'],\n",
- " upper_terms=['Price', 'Income', 'Advertising'])\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error')\n",
- "selector.fit(Carseats, Y)\n",
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "id": "5c6732fa",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([('Price',), ('Price', 'Advertising'), ('Price', 'Income'), ('Price', 'Advertising', 'Income')])"
- ]
- },
- "execution_count": 65,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7bb6fcc3",
- "metadata": {},
- "source": [
- "### Stepwise selection"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "id": "9985d0fc",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Advertising', 'Income', 'Price', 'ShelveLoc')"
- ]
- },
- "execution_count": 66,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "strategy = Stepwise.first_peak(design,\n",
- " min_terms=0,\n",
- " max_terms=6,\n",
- " lower_terms=['Price'],\n",
- " upper_terms=['Price', 'Income', 'Advertising', 'ShelveLoc', 'UIncome', 'US'\n",
- " 'Education', 'Urban'])\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error',\n",
- " cv=3)\n",
- "selector.fit(Carseats, Y)\n",
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "id": "d3cf3e9b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([(), ('Price',), ('Price', 'UIncome'), ('Advertising', 'Price'), ('Income', 'Price'), ('Price', 'ShelveLoc'), ('Price', 'Urban'), ('Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Price', 'ShelveLoc'), ('Income', 'Price', 'ShelveLoc'), ('Price', 'ShelveLoc', 'Urban'), ('Advertising', 'Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Income', 'Price', 'ShelveLoc'), ('Advertising', 'Price', 'ShelveLoc', 'Urban'), ('Advertising', 'Income', 'Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Income', 'Price', 'ShelveLoc', 'Urban')])"
- ]
- },
- "execution_count": 67,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "id": "dd43ea7c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{(): -8.055847677297269,\n",
- " ('Price',): -6.514630258019962,\n",
- " ('Price', 'UIncome'): -6.621654905418576,\n",
- " ('Advertising', 'Price'): -5.825225309857156,\n",
- " ('Income', 'Price'): -6.455432795910743,\n",
- " ('Price', 'ShelveLoc'): -3.780183168075897,\n",
- " ('Price', 'Urban'): -6.5430157266926114,\n",
- " ('Price', 'ShelveLoc', 'UIncome'): -3.6938729706475004,\n",
- " ('Advertising', 'Price', 'ShelveLoc'): -3.2067316025050645,\n",
- " ('Income', 'Price', 'ShelveLoc'): -3.634698914456587,\n",
- " ('Price', 'ShelveLoc', 'Urban'): -3.776148947585277,\n",
- " ('Advertising', 'Price', 'ShelveLoc', 'UIncome'): -3.1240961493998642,\n",
- " ('Advertising', 'Income', 'Price', 'ShelveLoc'): -3.0801704971796244,\n",
- " ('Advertising', 'Price', 'ShelveLoc', 'Urban'): -3.207569489139369,\n",
- " ('Advertising',\n",
- " 'Income',\n",
- " 'Price',\n",
- " 'ShelveLoc',\n",
- " 'UIncome'): -3.1048826894036115,\n",
- " ('Advertising', 'Income', 'Price', 'ShelveLoc', 'Urban'): -3.0867130108677423}"
- ]
- },
- "execution_count": 68,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.results_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "id": "7c026f0a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Advertising', 'Income', 'Price', 'ShelveLoc')"
- ]
- },
- "execution_count": 69,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b4b89d04",
- "metadata": {},
- "source": [
- "### Enforcing constraints\n",
- "\n",
- "In models with interactions, we may often want to impose constraints on interactions and main effects.\n",
- "This can be achieved here by use of a `validator` that checks whether a given model is valid.\n",
- "\n",
- "Suppose we want to have the following constraint: `ShelveLoc` may not be in the model unless\n",
- "`Price` is in the following model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "id": "1c1e31d0",
- "metadata": {},
- "outputs": [],
- "source": [
- "design = ModelSpec(['Price', \n",
- " 'Advertising', \n",
- " 'Income',\n",
- " 'ShelveLoc']).fit(Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "be929807",
- "metadata": {},
- "source": [
- "The constraints are described with a boolean matrix with `(i,j)` as `j` is a child of `i`: so `j` should not\n",
- "be in the model when `i` is not and enforced with a callable `validator` that evaluates each candidate state.\n",
- "\n",
- "Both `min_max_strategy` and `step_strategy` accept a `validator` argument."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "id": "c075b1b7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([(), ('Price',), ('Advertising',), ('Income',), ('Price', 'Advertising'), ('Price', 'Income'), ('Price', 'ShelveLoc'), ('Advertising', 'Income'), ('Price', 'Advertising', 'Income'), ('Price', 'Advertising', 'ShelveLoc'), ('Price', 'Income', 'ShelveLoc'), ('Price', 'Advertising', 'Income', 'ShelveLoc')])"
- ]
- },
- "execution_count": 71,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.strategy import validator_from_constraints\n",
- "constraints = np.zeros((4, 4))\n",
- "constraints[0,3] = 1\n",
- "strategy = min_max(design,\n",
- " min_terms=0,\n",
- " max_terms=4,\n",
- " validator=validator_from_constraints(design,\n",
- " constraints))\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error',\n",
- " cv=3)\n",
- "selector.fit(Carseats, Y)\n",
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 72,
- "id": "3472d47c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Price', 'Advertising', 'Income', 'ShelveLoc')"
- ]
- },
- "execution_count": 72,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 73,
- "id": "5d2c82b9",
- "metadata": {},
- "outputs": [],
- "source": [
- "Hitters=load_data('Hitters')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 74,
- "id": "4b2ac2c2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n",
- " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n",
- " 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 74,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Hitters.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "id": "bd2ad0dd",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([(), ('AtBat',), ('Hits',), ('HmRun',), ('Runs',), ('RBI',), ('Walks',), ('Years',), ('CAtBat',), ('CHits',), ('CHmRun',), ('CRuns',), ('CRBI',), ('CWalks',), ('League',), ('Division',), ('PutOuts',), ('Assists',), ('Errors',), ('NewLeague',), ('AtBat', 'CRBI'), ('CRBI', 'Hits'), ('CRBI', 'HmRun'), ('CRBI', 'Runs'), ('CRBI', 'RBI'), ('CRBI', 'Walks'), ('CRBI', 'Years'), ('CAtBat', 'CRBI'), ('CHits', 'CRBI'), ('CHmRun', 'CRBI'), ('CRBI', 'CRuns'), ('CRBI', 'CWalks'), ('CRBI', 'League'), ('CRBI', 'Division'), ('CRBI', 'PutOuts'), ('Assists', 'CRBI'), ('CRBI', 'Errors'), ('CRBI', 'NewLeague'), ('AtBat', 'CRBI', 'Hits'), ('CRBI', 'Hits', 'HmRun'), ('CRBI', 'Hits', 'Runs'), ('CRBI', 'Hits', 'RBI'), ('CRBI', 'Hits', 'Walks'), ('CRBI', 'Hits', 'Years'), ('CAtBat', 'CRBI', 'Hits'), ('CHits', 'CRBI', 'Hits'), ('CHmRun', 'CRBI', 'Hits'), ('CRBI', 'CRuns', 'Hits'), ('CRBI', 'CWalks', 'Hits'), ('CRBI', 'Hits', 'League'), ('CRBI', 'Division', 'Hits'), ('CRBI', 'Hits', 'PutOuts'), ('Assists', 'CRBI', 'Hits'), ('CRBI', 'Errors', 'Hits'), ('CRBI', 'Hits', 'NewLeague'), ('AtBat', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'HmRun', 'PutOuts'), ('CRBI', 'Hits', 'PutOuts', 'Runs'), ('CRBI', 'Hits', 'PutOuts', 'RBI'), ('CRBI', 'Hits', 'PutOuts', 'Walks'), ('CRBI', 'Hits', 'PutOuts', 'Years'), ('CAtBat', 'CRBI', 'Hits', 'PutOuts'), ('CHits', 'CRBI', 'Hits', 'PutOuts'), ('CHmRun', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'CRuns', 'Hits', 'PutOuts'), ('CRBI', 'CWalks', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'League', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'PutOuts'), ('Assists', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'Errors', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Runs'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'RBI'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Years'), 
('CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CHits', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'League', 'PutOuts'), ('Assists', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Errors', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Runs'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'RBI'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CHits', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'League', 'PutOuts'), ('Assists', 'AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Errors', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 
'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 
'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 
'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'RBI', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 
'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 
'HmRun', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years')])"
- ]
- },
- "execution_count": 75,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Hitters = Hitters.dropna()\n",
- "Y=Hitters['Salary']\n",
- "X=Hitters.drop('Salary', axis=1)\n",
- "design = ModelSpec(X.columns).fit(X)\n",
- "strategy = Stepwise.first_peak(design,\n",
- " direction='forward',\n",
- " min_terms=0,\n",
- " max_terms=19)\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error', cv=None)\n",
- "selector.fit(X, Y)\n",
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "id": "31788748",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19"
- ]
- },
- "execution_count": 76,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(selector.selected_state_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
- "id": "e97d80c3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19"
- ]
- },
- "execution_count": 77,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(X.columns)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a71f0332",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Start: AIC=3215.77\n",
- "Salary ~ 1\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ CRBI 1 17139434 36179679 3115.8\n",
- "+ CRuns 1 16881162 36437951 3117.6\n",
- "+ CHits 1 16065140 37253973 3123.5\n",
- "+ CAtBat 1 14759710 38559403 3132.5\n",
- "+ CHmRun 1 14692193 38626920 3133.0\n",
- "+ CWalks 1 12792622 40526491 3145.6\n",
- "+ RBI 1 10771083 42548030 3158.4\n",
- "+ Walks 1 10504833 42814280 3160.1\n",
- "+ Hits 1 10260491 43058621 3161.6\n",
- "+ Runs 1 9399158 43919955 3166.8\n",
- "+ Years 1 8559105 44760007 3171.7\n",
- "+ AtBat 1 8309469 45009644 3173.2\n",
- "+ HmRun 1 6273967 47045145 3184.8\n",
- "+ PutOuts 1 4814100 48505013 3192.9\n",
- "+ Division 1 1976102 51343011 3207.8\n",
- " 53319113 3215.8\n",
- "+ Assists 1 34497 53284615 3217.6\n",
- "+ League 1 10876 53308237 3217.7\n",
- "+ Errors 1 1555 53317558 3217.8\n",
- "+ NewLeague 1 428 53318684 3217.8\n",
- "\n",
- "Step: AIC=3115.78\n",
- "Salary ~ CRBI\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ Hits 1 5533119 30646560 3074.1\n",
- "+ Runs 1 5176532 31003147 3077.2\n",
- "+ Walks 1 4199733 31979946 3085.3\n",
- "+ AtBat 1 4064585 32115095 3086.4\n",
- "+ RBI 1 3308272 32871407 3092.6\n",
- "+ PutOuts 1 3267035 32912644 3092.9\n",
- "+ Division 1 1733887 34445793 3104.9\n",
- "+ Years 1 1667339 34512340 3105.4\n",
- "+ HmRun 1 1271587 34908092 3108.4\n",
- "+ CRuns 1 354561 35825119 3115.2\n",
- "+ Assists 1 346020 35833659 3115.2\n",
- " 36179679 3115.8\n",
- "+ Errors 1 194403 35985276 3116.4\n",
- "+ CAtBat 1 92261 36087418 3117.1\n",
- "+ CHits 1 75469 36104210 3117.2\n",
- "+ CWalks 1 51974 36127705 3117.4\n",
- "+ NewLeague 1 17778 36161901 3117.7\n",
- "+ League 1 11825 36167855 3117.7\n",
- "+ CHmRun 1 515 36179165 3117.8\n",
- "\n",
- "Step: AIC=3074.13\n",
- "Salary ~ CRBI + Hits\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ PutOuts 1 1397263 29249297 3063.8\n",
- "+ Division 1 1279275 29367285 3064.9\n",
- "+ AtBat 1 821767 29824793 3069.0\n",
- "+ Walks 1 781767 29864793 3069.3\n",
- "+ Years 1 254910 30391650 3073.9\n",
- " 30646560 3074.1\n",
- "+ League 1 208880 30437680 3074.3\n",
- "+ CRuns 1 132614 30513946 3075.0\n",
- "+ NewLeague 1 118474 30528086 3075.1\n",
- "+ Runs 1 114198 30532362 3075.1\n",
- "+ Errors 1 99776 30546784 3075.3\n",
- "+ CAtBat 1 83517 30563043 3075.4\n",
- "+ Assists 1 44781 30601779 3075.7\n",
- "+ CWalks 1 23668 30622892 3075.9\n",
- "+ CHmRun 1 4790 30641769 3076.1\n",
- "+ CHits 1 4358 30642202 3076.1\n",
- "+ HmRun 1 2173 30644387 3076.1\n",
- "+ RBI 1 1137 30645423 3076.1\n",
- "\n",
- "Step: AIC=3063.85\n",
- "Salary ~ CRBI + Hits + PutOuts\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ Division 1 1278445 27970852 3054.1\n",
- "+ AtBat 1 1009933 28239364 3056.6\n",
- "+ Walks 1 539490 28709807 3061.0\n",
- "+ CRuns 1 273649 28975648 3063.4\n",
- " 29249297 3063.8\n",
- "+ Years 1 136906 29112391 3064.6\n",
- "+ League 1 122841 29126456 3064.8\n",
- "+ Runs 1 117930 29131367 3064.8\n",
- "+ Errors 1 97244 29152053 3065.0\n",
- "+ NewLeague 1 57839 29191458 3065.3\n",
- "+ CHits 1 35096 29214201 3065.5\n",
- "+ RBI 1 33965 29215331 3065.6\n",
- "+ HmRun 1 31227 29218070 3065.6\n",
- "+ CWalks 1 28572 29220725 3065.6\n",
- "+ CAtBat 1 20518 29228779 3065.7\n",
- "+ Assists 1 1681 29247616 3065.8\n",
- "+ CHmRun 1 1419 29247878 3065.8\n",
- "\n",
- "Step: AIC=3054.1\n",
- "Salary ~ CRBI + Hits + PutOuts + Division\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ AtBat 1 820952 27149899 3048.3\n",
- "+ Walks 1 491584 27479268 3051.4\n",
- " 27970852 3054.1\n",
- "+ CRuns 1 193604 27777248 3054.3\n",
- "+ Years 1 166845 27804007 3054.5\n",
- "+ League 1 110628 27860224 3055.1\n",
- "+ Errors 1 81385 27889467 3055.3\n",
- "+ Runs 1 65921 27904931 3055.5\n",
- "+ RBI 1 53719 27917133 3055.6\n",
- "+ NewLeague 1 52275 27918577 3055.6\n",
- "+ CHits 1 33863 27936989 3055.8\n",
- "+ HmRun 1 26390 27944462 3055.8\n",
- "+ CAtBat 1 18751 27952101 3055.9\n",
- "+ CWalks 1 5723 27965129 3056.0\n",
- "+ Assists 1 1036 27969816 3056.1\n",
- "+ CHmRun 1 165 27970687 3056.1\n",
- "\n",
- "Step: AIC=3048.26\n",
- "Salary ~ CRBI + Hits + PutOuts + Division + AtBat\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ Walks 1 954996 26194904 3040.8\n",
- "+ Years 1 253362 26896537 3047.8\n",
- "+ Runs 1 208743 26941157 3048.2\n",
- " 27149899 3048.3\n",
- "+ CRuns 1 185825 26964075 3048.5\n",
- "+ League 1 95986 27053913 3049.3\n",
- "+ NewLeague 1 52693 27097206 3049.8\n",
- "+ CHmRun 1 43173 27106726 3049.8\n",
- "+ Assists 1 28898 27121001 3050.0\n",
- "+ CAtBat 1 20989 27128910 3050.1\n",
- "+ CWalks 1 15599 27134301 3050.1\n",
- "+ Errors 1 6265 27143634 3050.2\n",
- "+ CHits 1 5305 27144594 3050.2\n",
- "+ RBI 1 1236 27148663 3050.2\n",
- "+ HmRun 1 11 27149888 3050.3\n",
- "\n",
- "Step: AIC=3040.85\n",
- "Salary ~ CRBI + Hits + PutOuts + Division + AtBat + Walks\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ CWalks 1 240687 25954217 3040.4\n",
- " 26194904 3040.8\n",
- "+ Years 1 184508 26010396 3041.0\n",
- "+ CRuns 1 110695 26084209 3041.7\n",
- "+ League 1 77974 26116930 3042.1\n",
- "+ Assists 1 75782 26119122 3042.1\n",
- "+ NewLeague 1 40909 26153995 3042.4\n",
- "+ CHits 1 37304 26157599 3042.5\n",
- "+ RBI 1 11728 26183176 3042.7\n",
- "+ HmRun 1 4747 26190157 3042.8\n",
- "+ Errors 1 2727 26192177 3042.8\n",
- "+ CAtBat 1 2630 26192274 3042.8\n",
- "+ CHmRun 1 943 26193961 3042.8\n",
- "+ Runs 1 37 26194867 3042.8\n",
- "\n",
- "Step: AIC=3040.42\n",
- "Salary ~ CRBI + Hits + PutOuts + Division + AtBat + Walks + CWalks\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ CRuns 1 794983 25159234 3034.2\n",
- "+ CHits 1 273728 25680489 3039.6\n",
- " 25954217 3040.4\n",
- "+ Assists 1 138506 25815711 3041.0\n",
- "+ CAtBat 1 89289 25864929 3041.5\n",
- "+ RBI 1 86941 25867276 3041.5\n",
- "+ League 1 77159 25877058 3041.6\n",
- "+ Years 1 70126 25884091 3041.7\n",
- "+ NewLeague 1 37807 25916410 3042.0\n",
- "+ HmRun 1 33601 25920616 3042.1\n",
- "+ CHmRun 1 9034 25945183 3042.3\n",
- "+ Errors 1 6928"
- ]
- }
- ],
- "source": [
- "%%R -i Hitters\n",
- "step(lm(Salary ~ 1, data=Hitters), scope=list(upper=lm(Salary ~ ., data=Hitters)), direction='forward', trace=TRUE)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6117f650",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "536a8bc3",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bddc13c5",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -2726,9 +262,9 @@
"formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
diff --git a/docs/jupyterbook/models/selection.md b/docs/jupyterbook/models/selection.md
index c868c75..949ccc1 100644
--- a/docs/jupyterbook/models/selection.md
+++ b/docs/jupyterbook/models/selection.md
@@ -5,670 +5,107 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Model selection using `ModelSpec`
-```{code-cell} ipython3
-import numpy as np, pandas as pd
-%load_ext rpy2.ipython
-
-from ISLP import load_data
-from ISLP.models import ModelSpec
-
-import statsmodels.api as sm
-```
-
-```{code-cell} ipython3
-Carseats = load_data('Carseats')
-%R -i Carseats
-Carseats.columns
-```
-
-## Let's break up income into groups
-
-```{code-cell} ipython3
-Carseats['OIncome'] = pd.cut(Carseats['Income'],
- [0,50,90,200],
- labels=['L','M','H'])
-Carseats['OIncome']
-```
-
-Let's also create an unordered version
-
-```{code-cell} ipython3
-Carseats['UIncome'] = pd.cut(Carseats['Income'],
- [0,50,90,200],
- labels=['L','M','H'],
- ordered=False)
-Carseats['UIncome']
-```
-
-## A simple model
-
-```{code-cell} ipython3
-design = ModelSpec(['Price', 'Income'])
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-Y = Carseats['Sales']
-M = sm.OLS(Y, X).fit()
-M.params
-```
-
-## Basic procedure
-
-The design matrix is built by cobbling together a set of columns and possibly transforming them.
-A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`
-is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:
-
-```{code-cell} ipython3
-Carseats['ShelveLoc']
-```
-
-This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods
-`get_columns` and `fit_encoder`.
-
-```{code-cell} ipython3
-design.column_info_['ShelveLoc']
-```
-
-It recognized ordinal columns as well.
-
-```{code-cell} ipython3
-design.column_info_['OIncome']
-```
-
-```{code-cell} ipython3
-income = design.column_info_['Income']
-cols, names = income.get_columns(Carseats)
-(cols[:4], names)
-```
-
-## Encoding a column
-
-In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical
-variables usually are encoded by several columns, typically one less than the number of categories.
-This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform
-model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits
-its encoder the first time data is passed to it.
-
-```{code-cell} ipython3
-shelve = design.column_info_['ShelveLoc']
-cols, names = shelve.get_columns(Carseats)
-(cols[:4], names)
-```
-
-```{code-cell} ipython3
-oincome = design.column_info_['OIncome']
-oincome.get_columns(Carseats)[0][:4]
-```
-
-## The terms
-
-The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through
-the `terms` argument which should be a sequence. The elements of `terms` are often
-going to be strings (or tuples of strings for interactions, see below) but are converted to a
-`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple.
-
-```{code-cell} ipython3
-design.terms
-```
-
-```{code-cell} ipython3
-design.terms_
-```
-
-While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A
-`Variable` can also create columns through the `build_columns` method of `ModelSpec`
-
-```{code-cell} ipython3
-price = design.terms_[0]
-design.build_columns(Carseats, price)
-```
-
-Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The
-tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then
-is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during
-the call to `ModelSpec.fit`.
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import Variable
-
-new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)
-design.build_columns(Carseats, new_var)
-```
-
-Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the
-arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`.
-
-```{code-cell} ipython3
-from sklearn.decomposition import PCA
-pca = PCA(n_components=2)
-pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`
-pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)
-design.build_columns(Carseats, pca_var)
-```
-
-The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`)
-or `Variable` instances (`pca_var`).
-
-```{code-cell} ipython3
-fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)
-design.build_columns(Carseats, fancy_var)
-```
-
-We can of course run PCA again on these features (if we wanted).
-
-```{code-cell} ipython3
-pca2 = PCA(n_components=2)
-pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`
-pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)
-design.build_columns(Carseats, pca2_var)
-```
-
-## Building the design matrix
-
-With these notions in mind, the final design is essentially then
-
-```{code-cell} ipython3
-X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]
-```
-
-An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is
-a dataframe the index is adjusted accordingly.
-
-```{code-cell} ipython3
-design.intercept
-```
-
-```{code-cell} ipython3
-design.transform(Carseats)[:4]
-```
-
-## Predicting
-
-Constructing the design matrix at any values is carried out by the `transform` method.
-
-```{code-cell} ipython3
-new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})
-new_X = design.transform(new_data)
-M.get_prediction(new_X).predicted_mean
-```
-
-```{code-cell} ipython3
-%%R -i new_data,Carseats
-predict(lm(Sales ~ Price + Income, data=Carseats), new_data)
-```
-
-### Difference between using `pd.DataFrame` and `np.ndarray`
-
-If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.
-
-If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,
-in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning.
-
-```{code-cell} ipython3
-Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])
-design_np = ModelSpec([0,3]).fit(Carseats_np)
-design_np.transform(Carseats_np)[:4]
-```
-
-The following will fail for hopefully obvious reasons
-
-```{code-cell} ipython3
-try:
- new_D = np.zeros((2,2))
- new_D[:,0] = [10,20]
- new_D[:,1] = [40,50]
- M.get_prediction(new_D).predicted_mean
-except ValueError as e:
- print(e)
-```
-
-Ultimately, `M` expects 3 columns for new predictions because it was fit
-with a matrix having 3 columns (the first representing an intercept).
-
-We might be tempted to try as with the `pd.DataFrame` and produce
-an `np.ndarray` with only the necessary variables.
-
-```{code-cell} ipython3
-try:
- new_X = np.zeros((2,2))
- new_X[:,0] = [10,20]
- new_X[:,1] = [40,50]
- new_D = design_np.transform(new_X)
- M.get_prediction(new_D).predicted_mean
-except IndexError as e:
- print(e)
-```
-
-This fails because `design_np` is looking for column `3` from its `terms`:
-
-```{code-cell} ipython3
-design_np.terms_
-```
-
-However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed
-represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:
-
-```{code-cell} ipython3
-new_X = np.zeros((2,4))
-new_X[:,0] = [10,20]
-new_X[:,3] = [40,50]
-new_D = design_np.transform(new_X)
-M.get_prediction(new_D).predicted_mean
-```
-
-Given this subtlety about needing to supply arrays with identical column structure to `transform` when
-using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case.
-
-+++
-
-## A model with some categorical variables
-
-Categorical variables become `Column` instances with encoders.
-
-```{code-cell} ipython3
-design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)
-design.column_info_['UIncome']
-```
-
-```{code-cell} ipython3
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef
-```
-
-## Getting the encoding you want
-
-By default the level dropped by `ModelSpec` will be the first of the `categories_` values from
-`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems
-as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`
-where `new_encoder` would somehow drop the column we want dropped.
-
-However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:
-
-```{code-cell} ipython3
-design.column_info_['UIncome']
-```
-
-This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when
-we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest
-and we need a way to allow different encodings of the same column of `Carseats`
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)
-```
-
- We can create a new
-`Column` with the encoder we want. For categorical variables, there is a convenience function to do so.
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import contrast
-pref_encoding = contrast('UIncome', 'drop', 'L')
-```
-
-```{code-cell} ipython3
-design.build_columns(Carseats, pref_encoding)
-```
-
-```{code-cell} ipython3
-design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef
-```
-
-## Interactions
-
-We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument
-to `ModelSpec`.
-
-```{code-cell} ipython3
-design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with
-`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`.
-
-```{code-cell} ipython3
-design.terms_[0]
-```
-
-Comparing this to the previous `R` model.
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)
-```
-
-We note a few important things:
-
-1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these
-columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**
-
-2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**
-
-A few notes:
-
-- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**
-- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily.
-
-```{code-cell} ipython3
-full_encoding = contrast('UIncome', None)
-design.build_columns(Carseats, full_encoding)
-```
-
-```{code-cell} ipython3
-design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-## Special encodings
-
-For flexible models, we may want to consider transformations of features, i.e. polynomial
-or spline transformations. Given transforms that follow the `fit/transform` paradigm
-we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`
-package includes a `Poly` transform
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import poly
-poly('Income', 3)
-```
-
-```{code-cell} ipython3
-design = ModelSpec([poly('Income', 3), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-Compare:
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef
-```
-
-## Splines
-
-Support for natural and B-splines is also included
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import ns, bs, pca
-design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-library(splines)
-lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef
-```
-
-```{code-cell} ipython3
-design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef
-```
-
-## PCA
-
-```{code-cell} ipython3
-design = ModelSpec([pca(['Income',
- 'Price',
- 'Advertising',
- 'Population'],
- n_components=2,
- name='myvars'), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
+In this lab we illustrate how to run forward stepwise model selection
+using the model specification capability of `ModelSpec`.
```{code-cell} ipython3
-%%R
-lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)
-```
-
-It is of course common to scale before running PCA.
-
-```{code-cell} ipython3
-design = ModelSpec([pca(['Income',
- 'Price',
- 'Advertising',
- 'Population'],
- n_components=2,
- name='myvars',
- scale=True), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)
-```
-
-There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead
-of `np.std(ddof=1)`.
-
-```{code-cell} ipython3
-np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))
-```
-
-## Model selection
-
-Another task requiring different design matrices is model selection. Manipulating
-the `terms` attribute of a `ModelSpec` (or more precisely its more uniform version `terms_`)
-can clearly allow for both exhaustive and stepwise model selection.
-
-```{code-cell} ipython3
-from ISLP.models.strategy import (Stepwise,
- min_max)
-from ISLP.models.generic_selector import FeatureSelector
-```
-
-### Best subsets
-
-```{code-cell} ipython3
-design = ModelSpec(['Price',
- 'UIncome',
- 'Advertising',
- 'US',
- 'Income',
- 'ShelveLoc',
- 'Education',
- 'Urban']).fit(Carseats)
-strategy = min_max(design,
- min_terms=0,
- max_terms=3)
-```
-
-```{code-cell} ipython3
-from sklearn.linear_model import LinearRegression
-selector = FeatureSelector(LinearRegression(fit_intercept=False),
- strategy,
- scoring='neg_mean_squared_error')
-```
-
-```{code-cell} ipython3
-selector.fit(Carseats, Y)
-```
-
-```{code-cell} ipython3
-selector.selected_state_
-```
-
-```{code-cell} ipython3
-selector.results_.keys()
-```
-
-```{code-cell} ipython3
-strategy = min_max(design,
- min_terms=0,
- max_terms=3,
- lower_terms=['Price'],
- upper_terms=['Price', 'Income', 'Advertising'])
-selector = FeatureSelector(LinearRegression(fit_intercept=False),
- strategy,
- scoring='neg_mean_squared_error')
-selector.fit(Carseats, Y)
-selector.selected_state_
-```
-
-```{code-cell} ipython3
-selector.results_.keys()
+import numpy as np
+import pandas as pd
+from statsmodels.api import OLS
+from ISLP import load_data
+from ISLP.models import (ModelSpec,
+ Stepwise,
+ sklearn_selected)
```
-### Stepwise selection
+### Forward Selection
+
+We will apply the forward-selection approach to the `Hitters`
+data. We wish to predict a baseball player’s `Salary` on the
+basis of various statistics associated with performance in the
+previous year.
```{code-cell} ipython3
-strategy = Stepwise.first_peak(design,
- min_terms=0,
- max_terms=6,
- lower_terms=['Price'],
- upper_terms=['Price', 'Income', 'Advertising', 'ShelveLoc', 'UIncome', 'US'
- 'Education', 'Urban'])
-selector = FeatureSelector(LinearRegression(fit_intercept=False),
- strategy,
- scoring='neg_mean_squared_error',
- cv=3)
-selector.fit(Carseats, Y)
-selector.selected_state_
+Hitters = load_data('Hitters')
+np.isnan(Hitters['Salary']).sum()
```
-```{code-cell} ipython3
-selector.results_.keys()
-```
-
-```{code-cell} ipython3
-selector.results_
-```
+
+ We see that `Salary` is missing for 59 players. The
+`dropna()` method of data frames removes all of the rows that have missing
+values in any variable (by default --- see `Hitters.dropna?`).
```{code-cell} ipython3
-selector.selected_state_
+Hitters = Hitters.dropna()
+Hitters.shape
```
-### Enforcing constraints
-
-In models with interactions, we may often want to impose constraints on interactions and main effects.
-This can be achieved here by use of a `validator` that checks whether a given model is valid.
-
-Suppose we want to have the following constraint: `ShelveLoc` may not be in the model unless
-`Price` is in the following model.
+We first choose the best model using forward selection based on AIC. This score
+is not built in as a metric to `sklearn`. We therefore define a function to compute it ourselves, and use
+it as a scorer. By default, `sklearn` tries to maximize a score, hence
+ our scoring function computes the negative AIC statistic.
```{code-cell} ipython3
-design = ModelSpec(['Price',
- 'Advertising',
- 'Income',
- 'ShelveLoc']).fit(Carseats)
+def negAIC(estimator, X, Y):
+ "Negative AIC"
+ n, p = X.shape
+ Yhat = estimator.predict(X)
+ MSE = np.mean((Y - Yhat)**2)
+ return -(n + n * np.log(MSE) + 2 * (p + 1))
+
```
-The constraints are described with a boolean matrix with `(i,j)` as `j` is a child of `i`: so `j` should not
-be in the model when `i` is not and enforced with a callable `validator` that evaluates each candidate state.
-
-Both `min_max_strategy` and `step_strategy` accept a `validator` argument.
+We now build a model specification using all the variables, and construct
+the design matrix and the response from the `Hitters` data.
```{code-cell} ipython3
-from ISLP.models.strategy import validator_from_constraints
-constraints = np.zeros((4, 4))
-constraints[0,3] = 1
-strategy = min_max(design,
- min_terms=0,
- max_terms=4,
- validator=validator_from_constraints(design,
- constraints))
-selector = FeatureSelector(LinearRegression(fit_intercept=False),
- strategy,
- scoring='neg_mean_squared_error',
- cv=3)
-selector.fit(Carseats, Y)
-selector.results_.keys()
+design = ModelSpec(Hitters.columns.drop('Salary')).fit(Hitters)
+Y = np.array(Hitters['Salary'])
+X = design.transform(Hitters)
```
-```{code-cell} ipython3
-selector.selected_state_
-```
-
-```{code-cell} ipython3
-Hitters=load_data('Hitters')
-```
+Along with a score we need to specify the search strategy. This is done through the object
+`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()`
+runs forward stepwise until any further additions to the model do not result
+in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`
+runs a fixed number of steps of stepwise search.
```{code-cell} ipython3
-Hitters.columns
-```
-
-```{code-cell} ipython3
-Hitters = Hitters.dropna()
-Y=Hitters['Salary']
-X=Hitters.drop('Salary', axis=1)
-design = ModelSpec(X.columns).fit(X)
strategy = Stepwise.first_peak(design,
direction='forward',
- min_terms=0,
- max_terms=19)
-selector = FeatureSelector(LinearRegression(fit_intercept=False),
- strategy,
- scoring='neg_mean_squared_error', cv=None)
-selector.fit(X, Y)
-selector.results_.keys()
+ max_terms=len(design.terms))
```
-```{code-cell} ipython3
-len(selector.selected_state_)
-```
+
+We now fit a linear regression model with `Salary` as outcome using forward
+selection. To do so, we use the function `sklearn_selected()` from the `ISLP.models` package. This takes
+a model from `statsmodels` along with a search strategy and selects a model with its
+`fit` method. Without specifying a `scoring` argument, the score defaults to MSE, and so all 19 variables will be
+selected.
```{code-cell} ipython3
-len(X.columns)
+hitters_MSE = sklearn_selected(OLS,
+ strategy)
+hitters_MSE.fit(Hitters, Y)
+hitters_MSE.selected_state_
```
-```{code-cell} ipython3
-%%R -i Hitters
-step(lm(Salary ~ 1, data=Hitters), scope=list(upper=lm(Salary ~ ., data=Hitters)), direction='forward', trace=TRUE)
-```
+ Using `negAIC` results in a smaller model, as expected, with just 4 variables selected.
```{code-cell} ipython3
-
-```
-
-```{code-cell} ipython3
-
-```
-
-```{code-cell} ipython3
-
+hitters_Cp = sklearn_selected(OLS,
+ strategy,
+ scoring=negAIC)
+hitters_Cp.fit(Hitters, Y)
+hitters_Cp.selected_state_
```
diff --git a/docs/jupyterbook/models/spec.ipynb b/docs/jupyterbook/models/spec.ipynb
index b60e402..fce6b32 100644
--- a/docs/jupyterbook/models/spec.ipynb
+++ b/docs/jupyterbook/models/spec.ipynb
@@ -7,7 +7,14 @@
"source": [
"# Building design matrices with `ModelSpec`\n",
"\n",
- "Force rebuild"
+ "The `ISLP` package provides a facility to build design\n",
+ "matrices for regression and classification tasks. It provides similar functionality to the formula\n",
+ "notation of `R` though uses python objects rather than specification through the special formula syntax.\n",
+ "\n",
+ "Related tools include `patsy` and `ColumnTransformer` from `sklearn.compose`. \n",
+ "\n",
+ "Perhaps the most common use is to extract some columns from a `pd.DataFrame` and \n",
+ "produce a design matrix, optionally with an intercept."
]
},
{
@@ -17,12 +24,15 @@
"metadata": {},
"outputs": [],
"source": [
- "x=4\n",
- "import numpy as np, pandas as pd\n",
- "%load_ext rpy2.ipython\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
"\n",
"from ISLP import load_data\n",
- "from ISLP.models import ModelSpec\n",
+ "from ISLP.models import (ModelSpec,\n",
+ " summarize,\n",
+ " Column,\n",
+ " Feature,\n",
+ " build_columns)\n",
"\n",
"import statsmodels.api as sm"
]
@@ -48,40 +58,42 @@
],
"source": [
"Carseats = load_data('Carseats')\n",
- "%R -i Carseats\n",
"Carseats.columns"
]
},
{
"cell_type": "markdown",
- "id": "excellent-hamilton",
+ "id": "b7a2e6ab-491d-4a57-8184-a9fcccb2047b",
"metadata": {},
"source": [
- "## Let's break up income into groups"
+ "We'll first build a design matrix that we can use to model `Sales`\n",
+ "in terms of the categorical variable `ShelveLoc` and `Price`.\n",
+ "\n",
+ "We see first that `ShelveLoc` is a categorical variable:"
]
},
{
"cell_type": "code",
"execution_count": 3,
- "id": "going-administrator",
+ "id": "7d3642a6-90c6-48ad-8d35-88231b4991f8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: OIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L' < 'M' < 'H']"
+ "0 Bad\n",
+ "1 Good\n",
+ "2 Medium\n",
+ "3 Medium\n",
+ "4 Bad\n",
+ " ... \n",
+ "395 Good\n",
+ "396 Medium\n",
+ "397 Medium\n",
+ "398 Bad\n",
+ "399 Good\n",
+ "Name: ShelveLoc, Length: 400, dtype: category\n",
+ "Categories (3, object): ['Bad', 'Good', 'Medium']"
]
},
"execution_count": 3,
@@ -90,42 +102,142 @@
}
],
"source": [
- "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'])\n",
- "Carseats['OIncome']"
+ "Carseats['ShelveLoc']"
]
},
{
"cell_type": "markdown",
- "id": "warming-mobile",
+ "id": "4afa201d-4b19-4d85-9e1b-1392a54d027b",
"metadata": {},
"source": [
- "Let's also create an unordered version"
+ "This is recognized by `ModelSpec` and only 2 columns are added for the three levels. The\n",
+ "default behavior is to drop the first level of the categories. Later, \n",
+ "we will show other contrasts of the 3 columns can be produced. \n",
+ "\n",
+ "This simple example below illustrates how the first argument (its `terms`) is\n",
+ "used to construct a design matrix."
]
},
{
"cell_type": "code",
"execution_count": 4,
- "id": "varying-fourth",
+ "id": "fd5528fe-11da-4e10-8996-06085896c1a0",
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " intercept | \n",
+ " ShelveLoc[Good] | \n",
+ " ShelveLoc[Medium] | \n",
+ " Price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 83 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 80 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 97 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 128 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 72 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 108 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 124 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 124 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: UIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L', 'M', 'H']"
+ " intercept ShelveLoc[Good] ShelveLoc[Medium] Price\n",
+ "0 1.0 0.0 0.0 120\n",
+ "1 1.0 1.0 0.0 83\n",
+ "2 1.0 0.0 1.0 80\n",
+ "3 1.0 0.0 1.0 97\n",
+ "4 1.0 0.0 0.0 128\n",
+ "5 1.0 0.0 0.0 72\n",
+ "6 1.0 0.0 1.0 108\n",
+ "7 1.0 1.0 0.0 120\n",
+ "8 1.0 0.0 1.0 124\n",
+ "9 1.0 0.0 1.0 124"
]
},
"execution_count": 4,
@@ -134,31 +246,129 @@
}
],
"source": [
- "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'],\n",
- " ordered=False)\n",
- "Carseats['UIncome']"
+ "MS = ModelSpec(['ShelveLoc', 'Price'])\n",
+ "X = MS.fit_transform(Carseats)\n",
+ "X.iloc[:10]"
]
},
{
"cell_type": "markdown",
- "id": "utility-viking",
+ "id": "6948e1ef-3685-4840-a4f2-ef15a1bcfb69",
"metadata": {},
"source": [
- "## A simple model"
+ "We note that a column has been added for the intercept by default. This can be changed using the\n",
+ "`intercept` argument."
]
},
{
"cell_type": "code",
"execution_count": 5,
- "id": "unlikely-begin",
+ "id": "682d4c81-eba9-467d-a176-911a0269a21d",
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ShelveLoc[Good] | \n",
+ " ShelveLoc[Medium] | \n",
+ " Price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 83 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 80 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 97 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 128 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 72 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 108 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 124 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 124 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "Index(['intercept', 'Price', 'Income'], dtype='object')"
+ " ShelveLoc[Good] ShelveLoc[Medium] Price\n",
+ "0 0.0 0.0 120\n",
+ "1 1.0 0.0 83\n",
+ "2 0.0 1.0 80\n",
+ "3 0.0 1.0 97\n",
+ "4 0.0 0.0 128\n",
+ "5 0.0 0.0 72\n",
+ "6 0.0 1.0 108\n",
+ "7 1.0 0.0 120\n",
+ "8 0.0 1.0 124\n",
+ "9 0.0 1.0 124"
]
},
"execution_count": 5,
@@ -167,24 +377,143 @@
}
],
"source": [
- "design = ModelSpec(['Price', 'Income'])\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
+ "MS_no1 = ModelSpec(['ShelveLoc', 'Price'], intercept=False)\n",
+ "MS_no1.fit_transform(Carseats)[:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "54d8fd20-d8f5-44d6-9965-83e745680798",
+ "metadata": {},
+ "source": [
+ "We see that `ShelveLoc` still only contributes\n",
+ "two columns to the design. The `ModelSpec` object does not introspect its arguments to effectively include an intercept term\n",
+ "in the column space of the design matrix.\n",
+ "\n",
+ "To include this intercept via `ShelveLoc` we can use 3 columns to encode this categorical variable. Following the nomenclature of\n",
+ "`R`, we call this a `Contrast` of the categorical variable."
]
},
{
"cell_type": "code",
"execution_count": 6,
- "id": "driven-employee",
+ "id": "555734bb-2682-4721-a1cd-6fb207394b0e",
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ShelveLoc[Bad] | \n",
+ " ShelveLoc[Good] | \n",
+ " ShelveLoc[Medium] | \n",
+ " Price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 83 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 80 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 97 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 128 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 72 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 108 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 124 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 124 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "intercept 12.661546\n",
- "Price -0.052213\n",
- "Income 0.012829\n",
- "dtype: float64"
+ " ShelveLoc[Bad] ShelveLoc[Good] ShelveLoc[Medium] Price\n",
+ "0 1.0 0.0 0.0 120\n",
+ "1 0.0 1.0 0.0 83\n",
+ "2 0.0 0.0 1.0 80\n",
+ "3 0.0 0.0 1.0 97\n",
+ "4 1.0 0.0 0.0 128\n",
+ "5 1.0 0.0 0.0 72\n",
+ "6 0.0 0.0 1.0 108\n",
+ "7 0.0 1.0 0.0 120\n",
+ "8 0.0 0.0 1.0 124\n",
+ "9 0.0 0.0 1.0 124"
]
},
"execution_count": 6,
@@ -193,45 +522,32 @@
}
],
"source": [
- "Y = Carseats['Sales']\n",
- "M = sm.OLS(Y, X).fit()\n",
- "M.params"
+ "from ISLP.models import contrast\n",
+ "shelve = contrast('ShelveLoc', None)\n",
+ "MS_contr = ModelSpec([shelve, 'Price'], intercept=False)\n",
+ "MS_contr.fit_transform(Carseats)[:10]"
]
},
{
"cell_type": "markdown",
- "id": "secondary-winner",
+ "id": "66db03cf-489c-40b6-8fac-762d66cf9932",
"metadata": {},
"source": [
- "## Basic procedure\n",
- "\n",
- "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n",
- "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n",
- "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:"
+ "This example above illustrates that columns need not be identified by name in `terms`. The basic\n",
+ "role of an item in the `terms` sequence is a description of how to extract a column\n",
+ "from a columnar data object, usually a `pd.DataFrame`."
]
},
{
"cell_type": "code",
"execution_count": 7,
- "id": "bored-making",
+ "id": "852ee40e-05d2-4785-ab7d-968fb087f3c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0 Bad\n",
- "1 Good\n",
- "2 Medium\n",
- "3 Medium\n",
- "4 Bad\n",
- " ... \n",
- "395 Good\n",
- "396 Medium\n",
- "397 Medium\n",
- "398 Bad\n",
- "399 Good\n",
- "Name: ShelveLoc, Length: 400, dtype: category\n",
- "Categories (3, object): ['Bad', 'Good', 'Medium']"
+ "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=(), encoder=Contrast(method=None))"
]
},
"execution_count": 7,
@@ -240,28 +556,36 @@
}
],
"source": [
- "Carseats['ShelveLoc']"
+ "shelve"
]
},
{
"cell_type": "markdown",
- "id": "phantom-assurance",
+ "id": "b3be8808-1dbf-4154-882b-f61656a2ed4e",
"metadata": {},
"source": [
- "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n",
- "`get_columns` and `fit_encoder`."
+ "The `Column` object can be used to directly extract relevant columns from a `pd.DataFrame`. If the `encoder` field is not\n",
+ "`None`, then the extracted columns will be passed through `encoder`.\n",
+ "The `get_columns` method produces these columns as well as names for the columns."
]
},
{
"cell_type": "code",
"execution_count": 8,
- "id": "blind-harvest",
+ "id": "0ebadfc0-0ea2-4abc-aac6-ef78be227ce1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())"
+ "(array([[1., 0., 0.],\n",
+ " [0., 1., 0.],\n",
+ " [0., 0., 1.],\n",
+ " ...,\n",
+ " [0., 0., 1.],\n",
+ " [1., 0., 0.],\n",
+ " [0., 1., 0.]]),\n",
+ " ['ShelveLoc[Bad]', 'ShelveLoc[Good]', 'ShelveLoc[Medium]'])"
]
},
"execution_count": 8,
@@ -270,27 +594,89 @@
}
],
"source": [
- "design.column_info_['ShelveLoc']"
+ "shelve.get_columns(Carseats)"
]
},
{
"cell_type": "markdown",
- "id": "suspended-affairs",
+ "id": "269e6d18-4ae4-4a77-8498-90281ae7c803",
"metadata": {},
"source": [
- "It recognized ordinal columns as well."
+ "Let's now fit a simple OLS model with this design."
]
},
{
"cell_type": "code",
"execution_count": 9,
- "id": "military-locking",
+ "id": "411238d0-dd36-4878-a869-e8ce0ada099c",
"metadata": {},
"outputs": [
{
"data": {
- "text/plain": [
- "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())"
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " coef | \n",
+ " std err | \n",
+ " t | \n",
+ " P>|t| | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | ShelveLoc[Bad] | \n",
+ " 12.0018 | \n",
+ " 0.503 | \n",
+ " 23.839 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | ShelveLoc[Good] | \n",
+ " 16.8976 | \n",
+ " 0.522 | \n",
+ " 32.386 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | ShelveLoc[Medium] | \n",
+ " 13.8638 | \n",
+ " 0.487 | \n",
+ " 28.467 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | Price | \n",
+ " -0.0567 | \n",
+ " 0.004 | \n",
+ " -13.967 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " coef std err t P>|t|\n",
+ "ShelveLoc[Bad] 12.0018 0.503 23.839 0.0\n",
+ "ShelveLoc[Good] 16.8976 0.522 32.386 0.0\n",
+ "ShelveLoc[Medium] 13.8638 0.487 28.467 0.0\n",
+ "Price -0.0567 0.004 -13.967 0.0"
]
},
"execution_count": 9,
@@ -299,19 +685,166 @@
}
],
"source": [
- "design.column_info_['OIncome']"
+ "X = MS_contr.transform(Carseats)\n",
+ "Y = Carseats['Sales']\n",
+ "M_ols = sm.OLS(Y, X).fit()\n",
+ "summarize(M_ols)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "40ddf68e-7d58-4e30-93a8-5b7fe840d37a",
+ "metadata": {},
+ "source": [
+ "## Interactions\n",
+ "\n",
+ "One of the common uses of formulae in `R` is to specify interactions between variables.\n",
+ "This is done in `ModelSpec` by including a tuple in the `terms` argument."
]
},
{
"cell_type": "code",
"execution_count": 10,
- "id": "italic-shakespeare",
+ "id": "3f5e314c-7a7f-4e8d-bb07-295beb42c728",
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " intercept | \n",
+ " ShelveLoc[Bad]:Price | \n",
+ " ShelveLoc[Good]:Price | \n",
+ " ShelveLoc[Medium]:Price | \n",
+ " Price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.0 | \n",
+ " 120.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 83.0 | \n",
+ " 0.0 | \n",
+ " 83 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 80.0 | \n",
+ " 80 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 97.0 | \n",
+ " 97 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1.0 | \n",
+ " 128.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 128 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1.0 | \n",
+ " 72.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 72 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 108.0 | \n",
+ " 108 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 120.0 | \n",
+ " 0.0 | \n",
+ " 120 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 124.0 | \n",
+ " 124 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 124.0 | \n",
+ " 124 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "(array([ 73, 48, 35, 100]), ('Income',))"
+ " intercept ShelveLoc[Bad]:Price ShelveLoc[Good]:Price \\\n",
+ "0 1.0 120.0 0.0 \n",
+ "1 1.0 0.0 83.0 \n",
+ "2 1.0 0.0 0.0 \n",
+ "3 1.0 0.0 0.0 \n",
+ "4 1.0 128.0 0.0 \n",
+ "5 1.0 72.0 0.0 \n",
+ "6 1.0 0.0 0.0 \n",
+ "7 1.0 0.0 120.0 \n",
+ "8 1.0 0.0 0.0 \n",
+ "9 1.0 0.0 0.0 \n",
+ "\n",
+ " ShelveLoc[Medium]:Price Price \n",
+ "0 0.0 120 \n",
+ "1 0.0 83 \n",
+ "2 80.0 80 \n",
+ "3 97.0 97 \n",
+ "4 0.0 128 \n",
+ "5 0.0 72 \n",
+ "6 108.0 108 \n",
+ "7 0.0 120 \n",
+ "8 124.0 124 \n",
+ "9 124.0 124 "
]
},
"execution_count": 10,
@@ -320,65 +853,71 @@
}
],
"source": [
- "income = design.column_info_['Income']\n",
- "cols, names = income.get_columns(Carseats)\n",
- "(cols[:4], names)"
+ "ModelSpec([(shelve, 'Price'), 'Price']).fit_transform(Carseats).iloc[:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f85fcb2-f0ef-4c1b-a89f-fcf083937274",
+ "metadata": {},
+ "source": [
+ "The above design matrix is clearly rank deficient, as `ModelSpec` has not inspected the formula\n",
+ "and attempted to produce a corresponding matrix that may or may not match a user's intent."
]
},
{
"cell_type": "markdown",
- "id": "medieval-speed",
+ "id": "excellent-hamilton",
"metadata": {},
"source": [
- "## Encoding a column\n",
+ "## Ordinal variables\n",
"\n",
- "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n",
- "variables usually are encoded by several columns, typically one less than the number of categories.\n",
- "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n",
- "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n",
- "its encoder the first time data is passed to it."
+ "Ordinal variables are handled by a corresponding encoder."
]
},
{
"cell_type": "code",
"execution_count": 11,
- "id": "public-basket",
+ "id": "going-administrator",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n",
+ " [0,50,90,200], \n",
+ " labels=['L','M','H'])\n",
+ "MS_order = ModelSpec(['OIncome']).fit(Carseats)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e1defb1-071b-4751-9358-b8d2f0b3412e",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([[0., 0.],\n",
- " [1., 0.],\n",
- " [0., 1.],\n",
- " [0., 1.]]),\n",
- " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "shelve = design.column_info_['ShelveLoc']\n",
- "cols, names = shelve.get_columns(Carseats)\n",
- "(cols[:4], names)"
+ "Part of the `fit` method of `ModelSpec` involves inspection of the columns of `Carseats`. \n",
+ "The results of that inspection can be found in the `column_info_` attribute:"
]
},
{
"cell_type": "code",
"execution_count": 12,
- "id": "improved-alloy",
+ "id": "050fb4ae-648d-429d-9cb2-8423ad9707d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "array([[2.],\n",
- " [1.],\n",
- " [1.],\n",
- " [0.]])"
+ "{'Sales': Column(idx='Sales', name='Sales', is_categorical=False, is_ordinal=False, columns=('Sales',), encoder=None),\n",
+ " 'CompPrice': Column(idx='CompPrice', name='CompPrice', is_categorical=False, is_ordinal=False, columns=('CompPrice',), encoder=None),\n",
+ " 'Income': Column(idx='Income', name='Income', is_categorical=False, is_ordinal=False, columns=('Income',), encoder=None),\n",
+ " 'Advertising': Column(idx='Advertising', name='Advertising', is_categorical=False, is_ordinal=False, columns=('Advertising',), encoder=None),\n",
+ " 'Population': Column(idx='Population', name='Population', is_categorical=False, is_ordinal=False, columns=('Population',), encoder=None),\n",
+ " 'Price': Column(idx='Price', name='Price', is_categorical=False, is_ordinal=False, columns=('Price',), encoder=None),\n",
+ " 'ShelveLoc': Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast()),\n",
+ " 'Age': Column(idx='Age', name='Age', is_categorical=False, is_ordinal=False, columns=('Age',), encoder=None),\n",
+ " 'Education': Column(idx='Education', name='Education', is_categorical=False, is_ordinal=False, columns=('Education',), encoder=None),\n",
+ " 'Urban': Column(idx='Urban', name='Urban', is_categorical=True, is_ordinal=False, columns=('Urban[Yes]',), encoder=Contrast()),\n",
+ " 'US': Column(idx='US', name='US', is_categorical=True, is_ordinal=False, columns=('US[Yes]',), encoder=Contrast()),\n",
+ " 'OIncome': Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())}"
]
},
"execution_count": 12,
@@ -387,33 +926,32 @@
}
],
"source": [
- "oincome = design.column_info_['OIncome']\n",
- "oincome.get_columns(Carseats)[0][:4]"
+ "MS_order.column_info_"
]
},
{
"cell_type": "markdown",
- "id": "frank-mathematics",
+ "id": "debf7e2e-0a9d-451b-866c-66c0df9f43e5",
"metadata": {},
"source": [
- "## The terms\n",
+ "## Structure of a `ModelSpec`\n",
"\n",
- "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n",
- "the `terms` argument which should be a sequence. The elements of `terms` are often\n",
- "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n",
- "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple."
+ "The first argument to `ModelSpec` is stored as the `terms` attribute. Under the hood,\n",
+ "this sequence is inspected to produce the `terms_` attribute which specifies the objects\n",
+ "that will ultimately create the design matrix."
]
},
{
"cell_type": "code",
"execution_count": 13,
- "id": "together-north",
+ "id": "ea51e988-0857-4d49-9987-d7531b34a233",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "['Price', 'Income']"
+ "[Feature(variables=('ShelveLoc',), name='ShelveLoc', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
+ " Feature(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
]
},
"execution_count": 13,
@@ -422,64 +960,145 @@
}
],
"source": [
- "design.terms"
+ "MS = ModelSpec(['ShelveLoc', 'Price'])\n",
+ "MS.fit(Carseats)\n",
+ "MS.terms_"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "warming-mobile",
+ "metadata": {},
+ "source": [
+ "Each element of `terms_` should be a `Feature` which describes a set of columns to be extracted from\n",
+ "a columnar data form as well as a possible encoder."
]
},
{
"cell_type": "code",
"execution_count": 14,
- "id": "chinese-necessity",
+ "id": "59214a70-1e6b-41c4-9f44-a92d340723c9",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "design.terms_"
+ "shelve_var = MS.terms_[0]"
]
},
{
"cell_type": "markdown",
- "id": "simplified-chinese",
+ "id": "5fed3ea2-ff50-4e5d-819d-a948f121f9d3",
"metadata": {},
"source": [
- "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n",
- "`Variable` can also create columns through the `build_columns` method of `ModelSpec`"
+ "We can find the columns associated to each term using the `build_columns` method of `ModelSpec`:"
]
},
{
"cell_type": "code",
"execution_count": 15,
- "id": "automotive-hobby",
+ "id": "5e25ef64-497d-4f42-9f20-3d4a320cda23",
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ShelveLoc[Good] | \n",
+ " ShelveLoc[Medium] | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 395 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 396 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 397 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 398 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 399 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
400 rows × 2 columns
\n",
+ "
"
+ ],
"text/plain": [
- "( Price\n",
- " 0 120\n",
- " 1 83\n",
- " 2 80\n",
- " 3 97\n",
- " 4 128\n",
- " .. ...\n",
- " 395 128\n",
- " 396 120\n",
- " 397 159\n",
- " 398 95\n",
- " 399 120\n",
- " \n",
- " [400 rows x 1 columns],\n",
- " ['Price'])"
+ " ShelveLoc[Good] ShelveLoc[Medium]\n",
+ "0 0.0 0.0\n",
+ "1 1.0 0.0\n",
+ "2 0.0 1.0\n",
+ "3 0.0 1.0\n",
+ "4 0.0 0.0\n",
+ ".. ... ...\n",
+ "395 1.0 0.0\n",
+ "396 0.0 1.0\n",
+ "397 0.0 1.0\n",
+ "398 0.0 0.0\n",
+ "399 1.0 0.0\n",
+ "\n",
+ "[400 rows x 2 columns]"
]
},
"execution_count": 15,
@@ -488,280 +1107,37 @@
}
],
"source": [
- "price = design.terms_[0]\n",
- "design.build_columns(Carseats, price)"
+ "df, names = build_columns(MS.column_info_,\n",
+ " Carseats, \n",
+ " shelve_var)\n",
+ "df"
]
},
{
"cell_type": "markdown",
- "id": "former-spring",
- "metadata": {},
- "source": [
- "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
- "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
- "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n",
- "the call to `ModelSpec.fit`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "floral-liabilities",
+ "id": "63edf7a2-e776-45b0-b434-d676d7e13dbd",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price Income UIncome[L] UIncome[M]\n",
- " 0 120.0 73.0 0.0 1.0\n",
- " 1 83.0 48.0 1.0 0.0\n",
- " 2 80.0 35.0 1.0 0.0\n",
- " 3 97.0 100.0 0.0 0.0\n",
- " 4 128.0 64.0 0.0 1.0\n",
- " .. ... ... ... ...\n",
- " 395 128.0 108.0 0.0 0.0\n",
- " 396 120.0 23.0 1.0 0.0\n",
- " 397 159.0 26.0 1.0 0.0\n",
- " 398 95.0 79.0 0.0 1.0\n",
- " 399 120.0 37.0 1.0 0.0\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "from ISLP.models.model_spec import Variable\n",
- "\n",
- "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n",
- "design.build_columns(Carseats, new_var)"
+ "The design matrix is constructed by running through `terms_` and concatenating the corresponding columns."
]
},
{
"cell_type": "markdown",
- "id": "reasonable-canadian",
+ "id": "former-spring",
"metadata": {},
"source": [
- "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n",
- "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`."
+ "### `Feature` objects\n",
+ "\n",
+ "Note that `Feature` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
+ "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
+ "is run through `encoder.transform`. The `encoder.fit` method of each `Feature` is run once during \n",
+ "the call to `ModelSpec.fit`."
]
},
{
"cell_type": "code",
- "execution_count": 17,
- "id": "imported-measure",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( mynewvar[0] mynewvar[1]\n",
- " 0 -3.608693 -4.853177\n",
- " 1 15.081506 35.708630\n",
- " 2 27.422871 40.774250\n",
- " 3 -33.973209 13.470489\n",
- " 4 6.567316 -11.290100\n",
- " .. ... ...\n",
- " 395 -36.846346 -18.415783\n",
- " 396 45.741500 3.245602\n",
- " 397 49.097533 -35.725355\n",
- " 398 -13.577772 18.845139\n",
- " 399 31.927566 0.978436\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.decomposition import PCA\n",
- "pca = PCA(n_components=2)\n",
- "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n",
- "design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "institutional-burden",
- "metadata": {},
- "source": [
- "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
- "or `Variable` instances (`pca_var`)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "western-bloom",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( Price Price mynewvar[0] mynewvar[1]\n",
- " 0 120.0 120.0 -3.608693 -4.853177\n",
- " 1 83.0 83.0 15.081506 35.708630\n",
- " 2 80.0 80.0 27.422871 40.774250\n",
- " 3 97.0 97.0 -33.973209 13.470489\n",
- " 4 128.0 128.0 6.567316 -11.290100\n",
- " .. ... ... ... ...\n",
- " 395 128.0 128.0 -36.846346 -18.415783\n",
- " 396 120.0 120.0 45.741500 3.245602\n",
- " 397 159.0 159.0 49.097533 -35.725355\n",
- " 398 95.0 95.0 -13.577772 18.845139\n",
- " 399 120.0 120.0 31.927566 0.978436\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n",
- "design.build_columns(Carseats, fancy_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ordinary-newman",
- "metadata": {},
- "source": [
- "We can of course run PCA again on these features (if we wanted)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "modern-negotiation",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( fancy_pca[0] fancy_pca[1]\n",
- " 0 -6.951792 4.859283\n",
- " 1 55.170148 -24.694875\n",
- " 2 59.418556 -38.033572\n",
- " 3 34.722389 28.922184\n",
- " 4 -21.419184 -3.120673\n",
- " .. ... ...\n",
- " 395 -18.257348 40.760122\n",
- " 396 -10.546709 -45.021658\n",
- " 397 -77.706359 -37.174379\n",
- " 398 36.668694 7.730851\n",
- " 399 -9.540535 -31.059122\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['fancy_pca[0]', 'fancy_pca[1]'])"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pca2 = PCA(n_components=2)\n",
- "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n",
- "design.build_columns(Carseats, pca2_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "private-shepherd",
- "metadata": {},
- "source": [
- "## Building the design matrix\n",
- "\n",
- "With these notions in mind, the final design is essentially then"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "hollywood-union",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "suffering-lover",
- "metadata": {},
- "source": [
- "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n",
- "a dataframe the index is adjusted accordingly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "successful-express",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.intercept"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "banner-metadata",
+ "execution_count": 18,
+ "id": "floral-liabilities",
"metadata": {},
"outputs": [
{
@@ -785,1227 +1161,643 @@
" \n",
" \n",
" | \n",
- " intercept | \n",
" Price | \n",
" Income | \n",
+ " OIncome | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
- " 1.0 | \n",
- " 120 | \n",
- " 73 | \n",
+ " 120.0 | \n",
+ " 73.0 | \n",
+ " 2.0 | \n",
"
\n",
" \n",
" | 1 | \n",
+ " 83.0 | \n",
+ " 48.0 | \n",
" 1.0 | \n",
- " 83 | \n",
- " 48 | \n",
"
\n",
" \n",
" | 2 | \n",
+ " 80.0 | \n",
+ " 35.0 | \n",
" 1.0 | \n",
- " 80 | \n",
- " 35 | \n",
"
\n",
" \n",
" | 3 | \n",
+ " 97.0 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 128.0 | \n",
+ " 64.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 395 | \n",
+ " 128.0 | \n",
+ " 108.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 396 | \n",
+ " 120.0 | \n",
+ " 23.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 397 | \n",
+ " 159.0 | \n",
+ " 26.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 398 | \n",
+ " 95.0 | \n",
+ " 79.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 399 | \n",
+ " 120.0 | \n",
+ " 37.0 | \n",
" 1.0 | \n",
- " 97 | \n",
- " 100 | \n",
"
\n",
" \n",
"\n",
+ "400 rows × 3 columns
\n",
""
],
"text/plain": [
- " intercept Price Income\n",
- "0 1.0 120 73\n",
- "1 1.0 83 48\n",
- "2 1.0 80 35\n",
- "3 1.0 97 100"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.transform(Carseats)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "absent-branch",
- "metadata": {},
- "source": [
- "## Predicting\n",
- "\n",
- "Constructing the design matrix at any values is carried out by the `transform` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "naked-hollywood",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
+ " Price Income OIncome\n",
+ "0 120.0 73.0 2.0\n",
+ "1 83.0 48.0 1.0\n",
+ "2 80.0 35.0 1.0\n",
+ "3 97.0 100.0 0.0\n",
+ "4 128.0 64.0 2.0\n",
+ ".. ... ... ...\n",
+ "395 128.0 108.0 0.0\n",
+ "396 120.0 23.0 1.0\n",
+ "397 159.0 26.0 1.0\n",
+ "398 95.0 79.0 2.0\n",
+ "399 120.0 37.0 1.0\n",
+ "\n",
+ "[400 rows x 3 columns]"
]
},
- "execution_count": 23,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n",
- "new_X = design.transform(new_data)\n",
- "M.get_prediction(new_X).predicted_mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "iraqi-divorce",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 0 1 \n",
- "12.65258 12.25873 \n"
- ]
- }
- ],
- "source": [
- "%%R -i new_data,Carseats\n",
- "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)"
+ "new_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=None)\n",
+ "build_columns(MS.column_info_,\n",
+ " Carseats, \n",
+ " new_var)[0]"
]
},
{
"cell_type": "markdown",
- "id": "signal-yahoo",
+ "id": "reasonable-canadian",
"metadata": {},
"source": [
- "### Difference between using `pd.DataFrame` and `np.ndarray`\n",
- "\n",
- "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
- "\n",
- "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
- "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning."
+ "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n",
+ "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`."
]
},
{
"cell_type": "code",
- "execution_count": 25,
- "id": "completed-surveillance",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[1.0, 120, 73],\n",
- " [1.0, 83, 48],\n",
- " [1.0, 80, 35],\n",
- " [1.0, 97, 100]], dtype=object)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n",
- "design_np = ModelSpec([0,3]).fit(Carseats_np)\n",
- "design_np.transform(Carseats_np)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "undefined-sacrifice",
- "metadata": {},
- "source": [
- "The following will fail for hopefully obvious reasons"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "incredible-concert",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_D = np.zeros((2,2))\n",
- " new_D[:,0] = [10,20]\n",
- " new_D[:,1] = [40,50]\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except ValueError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "allied-botswana",
- "metadata": {},
- "source": [
- "Ultimately, `M` expects 3 columns for new predictions because it was fit\n",
- "with a matrix having 3 columns (the first representing an intercept).\n",
- "\n",
- "We might be tempted to try as with the `pd.DataFrame` and produce\n",
- "an `np.ndarray` with only the necessary variables."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "stunning-container",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "index 3 is out of bounds for axis 1 with size 2\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_X = np.zeros((2,2))\n",
- " new_X[:,0] = [10,20]\n",
- " new_X[:,1] = [40,50]\n",
- " new_D = design_np.transform(new_X)\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except IndexError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "specific-tobacco",
- "metadata": {},
- "source": [
- "This fails because `design_np` is looking for column `3` from its `terms`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "latin-publisher",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design_np.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "rocky-franchise",
- "metadata": {},
- "source": [
- "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n",
- "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "returning-matthew",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_X = np.zeros((2,4))\n",
- "new_X[:,0] = [10,20]\n",
- "new_X[:,3] = [40,50]\n",
- "new_D = design_np.transform(new_X)\n",
- "M.get_prediction(new_D).predicted_mean"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "sapphire-adelaide",
- "metadata": {},
- "source": [
- "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n",
- "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "standing-involvement",
- "metadata": {},
- "source": [
- "## A model with some categorical variables\n",
- "\n",
- "Categorical variables become `Column` instances with encoders."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "taken-university",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n",
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "rural-cycling",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "id": "former-trick",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 11.876012\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[L] -1.042297\n",
- "UIncome[M] -0.119123\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "specialized-processing",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "verified-administrator",
- "metadata": {},
- "source": [
- "## Getting the encoding you want\n",
- "\n",
- "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n",
- "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n",
- "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n",
- "where `new_encoder` would somehow drop the column we want dropped. \n",
- "\n",
- "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "limited-johns",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "saving-remainder",
- "metadata": {},
- "source": [
- "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n",
- "we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n",
- "and we need a way to allow different encodings of the same column of `Carseats`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "satisfied-harbor",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "silver-wesley",
- "metadata": {},
- "source": [
- " We can create a new \n",
- "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "crazy-bikini",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import contrast\n",
- "pref_encoding = contrast('UIncome', 'drop', 'L')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "accredited-barrier",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[M] UIncome[H]\n",
- " 0 1.0 0.0\n",
- " 1 0.0 0.0\n",
- " 2 0.0 0.0\n",
- " 3 0.0 1.0\n",
- " 4 1.0 0.0\n",
- " .. ... ...\n",
- " 395 0.0 1.0\n",
- " 396 0.0 0.0\n",
- " 397 0.0 0.0\n",
- " 398 1.0 0.0\n",
- " 399 0.0 0.0\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['UIncome[M]', 'UIncome[H]'])"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, pref_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "smaller-execution",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "limited-center",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 10.833715\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[M] 0.923174\n",
- "UIncome[H] 1.042297\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "combined-relaxation",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "focal-determination",
- "metadata": {},
- "source": [
- "## Interactions\n",
- "\n",
- "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n",
- "to `ModelSpec`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "earned-ready",
+ "execution_count": 20,
+ "id": "imported-measure",
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " mynewvar[0] | \n",
+ " mynewvar[1] | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " -3.595740 | \n",
+ " -4.850530 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 15.070401 | \n",
+ " 35.706773 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 27.412228 | \n",
+ " 40.772377 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " -33.983048 | \n",
+ " 13.468087 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 6.580644 | \n",
+ " -11.287452 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 395 | \n",
+ " -36.856308 | \n",
+ " -18.418138 | \n",
+ "
\n",
+ " \n",
+ " | 396 | \n",
+ " 45.731520 | \n",
+ " 3.243768 | \n",
+ "
\n",
+ " \n",
+ " | 397 | \n",
+ " 49.087659 | \n",
+ " -35.727136 | \n",
+ "
\n",
+ " \n",
+ " | 398 | \n",
+ " -13.565178 | \n",
+ " 18.847760 | \n",
+ "
\n",
+ " \n",
+ " | 399 | \n",
+ " 31.917072 | \n",
+ " 0.976615 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
400 rows × 2 columns
\n",
+ "
"
+ ],
"text/plain": [
- "intercept 7.866634\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "UIncome[L] -2.734895\n",
- "UIncome[M] -2.619745\n",
- "dtype: float64"
+ " mynewvar[0] mynewvar[1]\n",
+ "0 -3.595740 -4.850530\n",
+ "1 15.070401 35.706773\n",
+ "2 27.412228 40.772377\n",
+ "3 -33.983048 13.468087\n",
+ "4 6.580644 -11.287452\n",
+ ".. ... ...\n",
+ "395 -36.856308 -18.418138\n",
+ "396 45.731520 3.243768\n",
+ "397 49.087659 -35.727136\n",
+ "398 -13.565178 18.847760\n",
+ "399 31.917072 0.976615\n",
+ "\n",
+ "[400 rows x 2 columns]"
]
},
- "execution_count": 41,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
+ "from sklearn.decomposition import PCA\n",
+ "pca = PCA(n_components=2)\n",
+ "pca.fit(build_columns(MS.column_info_, Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
+ "pca_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=pca)\n",
+ "build_columns(MS.column_info_,\n",
+ " Carseats, \n",
+ " pca_var)[0]"
]
},
{
"cell_type": "markdown",
- "id": "prescribed-accessory",
+ "id": "institutional-burden",
"metadata": {},
"source": [
- "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n",
- "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`."
+ "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
+ "or `Feature` instances (`pca_var`)."
]
},
{
"cell_type": "code",
- "execution_count": 42,
- "id": "pacific-animal",
+ "execution_count": 21,
+ "id": "western-bloom",
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Income | \n",
+ " Price | \n",
+ " mynewvar[0] | \n",
+ " mynewvar[1] | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 73.0 | \n",
+ " 120.0 | \n",
+ " -3.595740 | \n",
+ " -4.850530 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 48.0 | \n",
+ " 83.0 | \n",
+ " 15.070401 | \n",
+ " 35.706773 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 35.0 | \n",
+ " 80.0 | \n",
+ " 27.412228 | \n",
+ " 40.772377 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 100.0 | \n",
+ " 97.0 | \n",
+ " -33.983048 | \n",
+ " 13.468087 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 64.0 | \n",
+ " 128.0 | \n",
+ " 6.580644 | \n",
+ " -11.287452 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 395 | \n",
+ " 108.0 | \n",
+ " 128.0 | \n",
+ " -36.856308 | \n",
+ " -18.418138 | \n",
+ "
\n",
+ " \n",
+ " | 396 | \n",
+ " 23.0 | \n",
+ " 120.0 | \n",
+ " 45.731520 | \n",
+ " 3.243768 | \n",
+ "
\n",
+ " \n",
+ " | 397 | \n",
+ " 26.0 | \n",
+ " 159.0 | \n",
+ " 49.087659 | \n",
+ " -35.727136 | \n",
+ "
\n",
+ " \n",
+ " | 398 | \n",
+ " 79.0 | \n",
+ " 95.0 | \n",
+ " -13.565178 | \n",
+ " 18.847760 | \n",
+ "
\n",
+ " \n",
+ " | 399 | \n",
+ " 37.0 | \n",
+ " 120.0 | \n",
+ " 31.917072 | \n",
+ " 0.976615 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
400 rows × 4 columns
\n",
+ "
"
+ ],
"text/plain": [
- "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n",
- " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n",
- " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
+ " Income Price mynewvar[0] mynewvar[1]\n",
+ "0 73.0 120.0 -3.595740 -4.850530\n",
+ "1 48.0 83.0 15.070401 35.706773\n",
+ "2 35.0 80.0 27.412228 40.772377\n",
+ "3 100.0 97.0 -33.983048 13.468087\n",
+ "4 64.0 128.0 6.580644 -11.287452\n",
+ ".. ... ... ... ...\n",
+ "395 108.0 128.0 -36.856308 -18.418138\n",
+ "396 23.0 120.0 45.731520 3.243768\n",
+ "397 26.0 159.0 49.087659 -35.727136\n",
+ "398 79.0 95.0 -13.565178 18.847760\n",
+ "399 37.0 120.0 31.917072 0.976615\n",
+ "\n",
+ "[400 rows x 4 columns]"
]
},
- "execution_count": 42,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "design.terms_[0]"
+ "price = MS.column_info_['Price']\n",
+ "fancy_var = Feature(('Income', price, pca_var), name='fancy', encoder=None)\n",
+ "build_columns(MS.column_info_,\n",
+ " Carseats, \n",
+ " fancy_var)[0]"
]
},
{
"cell_type": "markdown",
- "id": "planned-wrestling",
+ "id": "e289feba-e3f5-48e0-9e29-cdd88d7f9923",
"metadata": {},
"source": [
- "Comparing this to the previous `R` model."
+ "## Predicting at new points"
]
},
{
"cell_type": "code",
- "execution_count": 43,
- "id": "given-testimony",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "external-barrier",
- "metadata": {},
- "source": [
- "We note a few important things:\n",
- "\n",
- "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n",
- "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n",
- "\n",
- "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n",
- "\n",
- "A few notes:\n",
- "\n",
- "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n",
- "\n",
- "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "authentic-meditation",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[H] UIncome[L] UIncome[M]\n",
- " 0 0.0 0.0 1.0\n",
- " 1 0.0 1.0 0.0\n",
- " 2 0.0 1.0 0.0\n",
- " 3 1.0 0.0 0.0\n",
- " 4 0.0 0.0 1.0\n",
- " .. ... ... ...\n",
- " 395 1.0 0.0 0.0\n",
- " 396 0.0 1.0 0.0\n",
- " 397 0.0 1.0 0.0\n",
- " 398 0.0 0.0 1.0\n",
- " 399 0.0 1.0 0.0\n",
- " \n",
- " [400 rows x 3 columns],\n",
- " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "full_encoding = contrast('UIncome', None)\n",
- "design.build_columns(Carseats, full_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "id": "lucky-success",
+ "execution_count": 22,
+ "id": "6efed2fa-9e5d-429c-a8d9-ac544cab2b41",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "intercept 5.131739\n",
- "UIncome[M] 0.115150\n",
- "UIncome[H] 1.156118\n",
- "UIncome[H]:ShelveLoc[Good] 3.738052\n",
- "UIncome[H]:ShelveLoc[Medium] 1.514104\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
+ "intercept 12.661546\n",
+ "Price -0.052213\n",
+ "Income 0.012829\n",
"dtype: float64"
]
},
- "execution_count": 45,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
+ "MS = ModelSpec(['Price', 'Income']).fit(Carseats)\n",
+ "X = MS.transform(Carseats)\n",
+ "Y = Carseats['Sales']\n",
+ "M_ols = sm.OLS(Y, X).fit()\n",
+ "M_ols.params"
]
},
{
"cell_type": "markdown",
- "id": "laden-beach",
- "metadata": {},
- "source": [
- "## Special encodings\n",
- "\n",
- "For flexible models, we may want to consider transformations of features, i.e. polynomial\n",
- "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n",
- "we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`\n",
- "package includes a `Poly` transform"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "id": "copyrighted-luther",
+ "id": "e6b4609b-fcb2-4cc2-b630-509df4c87546",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('Income',), name='poly(Income, 3)', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "from ISLP.models.model_spec import poly\n",
- "poly('Income', 3)"
+ "As `ModelSpec` is a transformer, it can be evaluated at new feature values.\n",
+ "Constructing the design matrix at any values is carried out by the `transform` method."
]
},
{
"cell_type": "code",
- "execution_count": 47,
- "id": "threatened-marine",
+ "execution_count": 23,
+ "id": "8784b0e8-ce53-4a90-aee6-b935834295c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "intercept 5.440077\n",
- "poly(Income, 3)[0] 10.036373\n",
- "poly(Income, 3)[1] -2.799156\n",
- "poly(Income, 3)[2] 2.399601\n",
- "ShelveLoc[Good] 4.808133\n",
- "ShelveLoc[Medium] 1.889533\n",
- "dtype: float64"
+ "array([10.70130676, 10.307465 ])"
]
},
- "execution_count": 47,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
+ "new_data = pd.DataFrame({'Price':[40, 50], 'Income':[10, 20]})\n",
+ "new_X = MS.transform(new_data)\n",
+ "M_ols.get_prediction(new_X).predicted_mean"
]
},
{
"cell_type": "markdown",
- "id": "senior-spokesman",
- "metadata": {},
- "source": [
- "Compare:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "prompt-fifteen",
+ "id": "signal-yahoo",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n",
- " 5.440077 10.036373 -2.799156 2.399601 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.808133 1.889533 \n"
- ]
- }
- ],
"source": [
- "%%R\n",
- "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef"
+ "## Using `np.ndarray`\n",
+ "\n",
+ "As the basic model is to concatenate columns extracted from a columnar data\n",
+ "representation, one *can* use `np.ndarray` as the column data. In this case,\n",
+ "columns will be selected by integer indices. \n",
+ "\n",
+ "### Caveats using `np.ndarray`\n",
+ "\n",
+ "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
+ "However,\n",
+ "unless all features are floats, `np.ndarray` will default to a dtype of `object`, complicating issues.\n",
+ "\n",
+ "However, if we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
+ "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. \n",
+ "\n",
+ "We illustrate this below, where we build a model from `Price` and `Income` for `Sales` and want to find predictions at new\n",
+ "values of `Price` and `Location`. We first find the predicitions using `pd.DataFrame` and then illustrate the difficulties\n",
+ "in using `np.ndarray`."
]
},
{
"cell_type": "markdown",
- "id": "better-christianity",
+ "id": "e7ffdd07-4d6b-4a4c-ab38-ad1270e85de6",
"metadata": {},
"source": [
- "## Splines\n",
- "\n",
- "Support for natural and B-splines is also included"
+ "We will refit this model, using `ModelSpec` with an `np.ndarray` instead"
]
},
{
"cell_type": "code",
- "execution_count": 49,
- "id": "outstanding-performer",
+ "execution_count": 24,
+ "id": "4fec9030-7445-48be-a15f-2ac5a789e717",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "intercept 4.240421\n",
- "ns(Income, df=5)[0] 1.468196\n",
- "ns(Income, df=5)[1] 1.499471\n",
- "ns(Income, df=5)[2] 1.152070\n",
- "ns(Income, df=5)[3] 2.418398\n",
- "ns(Income, df=5)[4] 1.804460\n",
- "ShelveLoc[Good] 4.810449\n",
- "ShelveLoc[Medium] 1.881095\n",
- "dtype: float64"
+ "array([[ 1., 120., 73.],\n",
+ " [ 1., 83., 48.],\n",
+ " [ 1., 80., 35.],\n",
+ " ...,\n",
+ " [ 1., 159., 26.],\n",
+ " [ 1., 95., 79.],\n",
+ " [ 1., 120., 37.]])"
]
},
- "execution_count": 49,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "from ISLP.models.model_spec import ns, bs, pca\n",
- "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "informative-spirituality",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n",
- " 4.240421 1.468196 1.499471 1.152070 \n",
- "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n",
- " 2.418398 1.804460 4.810449 1.881095 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "library(splines)\n",
- "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef"
+ "Carseats_np = np.asarray(Carseats[['Price', 'Education', 'Income']])\n",
+ "MS_np = ModelSpec([0,2]).fit(Carseats_np)\n",
+ "MS_np.transform(Carseats_np)"
]
},
{
"cell_type": "code",
- "execution_count": 51,
- "id": "destroyed-complexity",
+ "execution_count": 25,
+ "id": "c864e365-2476-4ca6-9d27-625cac2b2271",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "intercept 3.495085\n",
- "bs(Income, df=7, degree=2)[0] 1.813118\n",
- "bs(Income, df=7, degree=2)[1] 0.961852\n",
- "bs(Income, df=7, degree=2)[2] 2.471545\n",
- "bs(Income, df=7, degree=2)[3] 2.158891\n",
- "bs(Income, df=7, degree=2)[4] 2.091625\n",
- "bs(Income, df=7, degree=2)[5] 2.600669\n",
- "bs(Income, df=7, degree=2)[6] 2.843108\n",
- "ShelveLoc[Good] 4.804919\n",
- "ShelveLoc[Medium] 1.880337\n",
+ "const 12.661546\n",
+ "x1 -0.052213\n",
+ "x2 0.012829\n",
"dtype: float64"
]
},
- "execution_count": 51,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "incident-nicaragua",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) bs(Income, df = 7, degree = 2)1 \n",
- " 3.4950851 1.8131176 \n",
- "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n",
- " 0.9618523 2.4715450 \n",
- "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n",
- " 2.1588908 2.0916252 \n",
- "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n",
- " 2.6006694 2.8431084 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.8049190 1.8803375 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef"
+ "M_ols_np = sm.OLS(Y, MS_np.transform(Carseats_np)).fit()\n",
+ "M_ols_np.params"
]
},
{
"cell_type": "markdown",
- "id": "formal-medline",
- "metadata": {},
- "source": [
- "## PCA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "general-joshua",
+ "id": "undefined-sacrifice",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.419405\n",
- "pca(myvars, n_components=2)[0] -0.001131\n",
- "pca(myvars, n_components=2)[1] -0.024217\n",
- "ShelveLoc[Good] 4.816253\n",
- "ShelveLoc[Medium] 1.924139\n",
- "dtype: float64"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars'), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
+ "Now, let's consider finding the design matrix at new points. \n",
+ "When using `pd.DataFrame` we only need to supply the `transform` method\n",
+ "a data frame with columns implicated in the `terms` argument (in this case, `Price` and `Income`). \n",
+ "\n",
+ "However, when using `np.ndarray` with integers as indices, `Price` was column 0 and `Income` was column 2. The only\n",
+ "sensible way to produce a return for predict is to extract its 0th and 2nd columns. Note this means\n",
+ "that the meaning of columns in an `np.ndarray` provided to `transform` essentially must be identical to those\n",
+ "passed to `fit`."
]
},
{
"cell_type": "code",
- "execution_count": 54,
- "id": "coordinate-calcium",
+ "execution_count": 26,
+ "id": "incredible-concert",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.419405 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n",
- " 0.001131 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n",
- " -0.024217 \n",
- " ShelveLocGood \n",
- " 4.816253 \n",
- " ShelveLocMedium \n",
- " 1.924139 \n",
- "\n"
+ "index 2 is out of bounds for axis 1 with size 2\n"
]
}
],
"source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)"
+ "try:\n",
+ " new_D = np.array([[40,50], [10,20]]).T\n",
+ " new_X = MS_np.transform(new_D)\n",
+ "except IndexError as e:\n",
+ " print(e)"
]
},
{
"cell_type": "markdown",
- "id": "foster-canvas",
+ "id": "allied-botswana",
"metadata": {},
"source": [
- "It is of course common to scale before running PCA."
+    "Ultimately, `M_ols` expects 3 columns for new predictions because it was fit\n",
+ "with a matrix having 3 columns (the first representing an intercept).\n",
+ "\n",
+    "We might be tempted, as with the `pd.DataFrame`, to produce\n",
+ "an `np.ndarray` with only the necessary variables."
]
},
{
"cell_type": "code",
- "execution_count": 55,
- "id": "geographic-founder",
+ "execution_count": 27,
+ "id": "stunning-container",
"metadata": {},
"outputs": [
{
- "name": "stderr",
+ "name": "stdout",
"output_type": "stream",
"text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
+ "[[ 1. 40. 10.]\n",
+ " [ 1. 50. 20.]]\n"
]
},
{
"data": {
"text/plain": [
- "intercept 5.352159\n",
- "pca(myvars, n_components=2)[0] 0.446383\n",
- "pca(myvars, n_components=2)[1] -1.219788\n",
- "ShelveLoc[Good] 4.922780\n",
- "ShelveLoc[Medium] 2.005617\n",
- "dtype: float64"
+ "array([10.70130676, 10.307465 ])"
]
},
- "execution_count": 55,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars',\n",
- " scale=True), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "id": "floral-packaging",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.3522 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n",
- " 0.4469 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n",
- " -1.2213 \n",
- " ShelveLocGood \n",
- " 4.9228 \n",
- " ShelveLocMedium \n",
- " 2.0056 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)"
+ "new_D = np.array([[40,50], [np.nan, np.nan], [10,20]]).T\n",
+ "new_X = MS_np.transform(new_D)\n",
+ "print(new_X)\n",
+ "M_ols.get_prediction(new_X).predicted_mean"
]
},
{
"cell_type": "markdown",
- "id": "social-cherry",
- "metadata": {},
- "source": [
- "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n",
- "of `np.std(ddof=1)`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "id": "another-glory",
+ "id": "specific-tobacco",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 0.44694166, -1.22131519])"
- ]
- },
- "execution_count": 57,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))"
+    "For more complicated design constructions, ensuring that the columns of `new_D` match those of the original data will be more cumbersome. We expect\n",
+ "then that `pd.DataFrame` (or a columnar data representation with similar API) will likely be easier to use with `ModelSpec`."
]
}
],
@@ -2014,9 +1806,9 @@
"formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -2028,7 +1820,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.10.10"
}
},
"nbformat": 4,
diff --git a/docs/jupyterbook/models/spec.md b/docs/jupyterbook/models/spec.md
index fdf8c60..27bb3a4 100644
--- a/docs/jupyterbook/models/spec.md
+++ b/docs/jupyterbook/models/spec.md
@@ -5,490 +5,296 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: Python 3 (ipykernel)
language: python
- name: islp_test
+ name: python3
---
# Building design matrices with `ModelSpec`
-Force rebuild
+The `ISLP` package provides a facility to build design
+matrices for regression and classification tasks. It provides similar functionality to the formula
+notation of `R` though uses python objects rather than specification through the special formula syntax.
+
+Related tools include `patsy` and `ColumnTransformer` from `sklearn.compose`.
+
+Perhaps the most common use is to extract some columns from a `pd.DataFrame` and
+produce a design matrix, optionally with an intercept.
```{code-cell} ipython3
-x=4
-import numpy as np, pandas as pd
-%load_ext rpy2.ipython
+import pandas as pd
+import numpy as np
from ISLP import load_data
-from ISLP.models import ModelSpec
+from ISLP.models import (ModelSpec,
+ summarize,
+ Column,
+ Feature,
+ build_columns)
import statsmodels.api as sm
```
```{code-cell} ipython3
Carseats = load_data('Carseats')
-%R -i Carseats
Carseats.columns
```
-## Let's break up income into groups
+We'll first build a design matrix that we can use to model `Sales`
+in terms of the categorical variable `ShelveLoc` and `Price`.
-```{code-cell} ipython3
-Carseats['OIncome'] = pd.cut(Carseats['Income'],
- [0,50,90,200],
- labels=['L','M','H'])
-Carseats['OIncome']
-```
-
-Let's also create an unordered version
-
-```{code-cell} ipython3
-Carseats['UIncome'] = pd.cut(Carseats['Income'],
- [0,50,90,200],
- labels=['L','M','H'],
- ordered=False)
-Carseats['UIncome']
-```
-
-## A simple model
-
-```{code-cell} ipython3
-design = ModelSpec(['Price', 'Income'])
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-Y = Carseats['Sales']
-M = sm.OLS(Y, X).fit()
-M.params
-```
-
-## Basic procedure
-
-The design matrix is built by cobbling together a set of columns and possibly transforming them.
-A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`
-is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:
+We see first that `ShelveLoc` is a categorical variable:
```{code-cell} ipython3
Carseats['ShelveLoc']
```
-This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods
-`get_columns` and `fit_encoder`.
-
-```{code-cell} ipython3
-design.column_info_['ShelveLoc']
-```
-
-It recognized ordinal columns as well.
-
-```{code-cell} ipython3
-design.column_info_['OIncome']
-```
-
-```{code-cell} ipython3
-income = design.column_info_['Income']
-cols, names = income.get_columns(Carseats)
-(cols[:4], names)
-```
-
-## Encoding a column
-
-In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical
-variables usually are encoded by several columns, typically one less than the number of categories.
-This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform
-model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits
-its encoder the first time data is passed to it.
-
-```{code-cell} ipython3
-shelve = design.column_info_['ShelveLoc']
-cols, names = shelve.get_columns(Carseats)
-(cols[:4], names)
-```
-
-```{code-cell} ipython3
-oincome = design.column_info_['OIncome']
-oincome.get_columns(Carseats)[0][:4]
-```
-
-## The terms
+This is recognized by `ModelSpec` and only 2 columns are added for the three levels. The
+default behavior is to drop the first level of the categories. Later,
+we will show that other contrasts of the 3 columns can be produced.
-The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through
-the `terms` argument which should be a sequence. The elements of `terms` are often
-going to be strings (or tuples of strings for interactions, see below) but are converted to a
-`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple.
+This simple example below illustrates how the first argument (its `terms`) is
+used to construct a design matrix.
```{code-cell} ipython3
-design.terms
+MS = ModelSpec(['ShelveLoc', 'Price'])
+X = MS.fit_transform(Carseats)
+X.iloc[:10]
```
-```{code-cell} ipython3
-design.terms_
-```
-
-While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A
-`Variable` can also create columns through the `build_columns` method of `ModelSpec`
+We note that a column has been added for the intercept by default. This can be changed using the
+`intercept` argument.
```{code-cell} ipython3
-price = design.terms_[0]
-design.build_columns(Carseats, price)
+MS_no1 = ModelSpec(['ShelveLoc', 'Price'], intercept=False)
+MS_no1.fit_transform(Carseats)[:10]
```
-Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The
-tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then
-is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during
-the call to `ModelSpec.fit`.
+We see that `ShelveLoc` still only contributes
+two columns to the design. The `ModelSpec` object does no introspection of its arguments to effectively include an intercept term
+in the column space of the design matrix.
-```{code-cell} ipython3
-from ISLP.models.model_spec import Variable
-
-new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)
-design.build_columns(Carseats, new_var)
-```
-
-Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the
-arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`.
+To include this intercept via `ShelveLoc` we can use 3 columns to encode this categorical variable. Following the nomenclature of
+`R`, we call this a `Contrast` of the categorical variable.
```{code-cell} ipython3
-from sklearn.decomposition import PCA
-pca = PCA(n_components=2)
-pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`
-pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)
-design.build_columns(Carseats, pca_var)
-```
-
-The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`)
-or `Variable` instances (`pca_var`).
-
-```{code-cell} ipython3
-fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)
-design.build_columns(Carseats, fancy_var)
+from ISLP.models import contrast
+shelve = contrast('ShelveLoc', None)
+MS_contr = ModelSpec([shelve, 'Price'], intercept=False)
+MS_contr.fit_transform(Carseats)[:10]
```
-We can of course run PCA again on these features (if we wanted).
+This example above illustrates that columns need not be identified by name in `terms`. The basic
+role of an item in the `terms` sequence is a description of how to extract a column
+from a columnar data object, usually a `pd.DataFrame`.
```{code-cell} ipython3
-pca2 = PCA(n_components=2)
-pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`
-pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)
-design.build_columns(Carseats, pca2_var)
+shelve
```
-## Building the design matrix
-
-With these notions in mind, the final design is essentially then
+The `Column` object can be used to directly extract relevant columns from a `pd.DataFrame`. If the `encoder` field is not
+`None`, then the extracted columns will be passed through `encoder`.
+The `get_columns` method produces these columns as well as names for the columns.
```{code-cell} ipython3
-X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]
+shelve.get_columns(Carseats)
```
-An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is
-a dataframe the index is adjusted accordingly.
+Let's now fit a simple OLS model with this design.
```{code-cell} ipython3
-design.intercept
-```
-
-```{code-cell} ipython3
-design.transform(Carseats)[:4]
+X = MS_contr.transform(Carseats)
+Y = Carseats['Sales']
+M_ols = sm.OLS(Y, X).fit()
+summarize(M_ols)
```
-## Predicting
+## Interactions
-Constructing the design matrix at any values is carried out by the `transform` method.
+One of the common uses of formulae in `R` is to specify interactions between variables.
+This is done in `ModelSpec` by including a tuple in the `terms` argument.
```{code-cell} ipython3
-new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})
-new_X = design.transform(new_data)
-M.get_prediction(new_X).predicted_mean
+ModelSpec([(shelve, 'Price'), 'Price']).fit_transform(Carseats).iloc[:10]
```
-```{code-cell} ipython3
-%%R -i new_data,Carseats
-predict(lm(Sales ~ Price + Income, data=Carseats), new_data)
-```
+The above design matrix is clearly rank deficient, as `ModelSpec` has not inspected the formula
+and attempted to produce a corresponding matrix that may or may not match a user's intent.
-### Difference between using `pd.DataFrame` and `np.ndarray`
++++
-If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.
+## Ordinal variables
-If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,
-in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning.
+Ordinal variables are handled by a corresponding encoder.
```{code-cell} ipython3
-Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])
-design_np = ModelSpec([0,3]).fit(Carseats_np)
-design_np.transform(Carseats_np)[:4]
+Carseats['OIncome'] = pd.cut(Carseats['Income'],
+ [0,50,90,200],
+ labels=['L','M','H'])
+MS_order = ModelSpec(['OIncome']).fit(Carseats)
```
-The following will fail for hopefully obvious reasons
+Part of the `fit` method of `ModelSpec` involves inspection of the columns of `Carseats`.
+The results of that inspection can be found in the `column_info_` attribute:
```{code-cell} ipython3
-try:
- new_D = np.zeros((2,2))
- new_D[:,0] = [10,20]
- new_D[:,1] = [40,50]
- M.get_prediction(new_D).predicted_mean
-except ValueError as e:
- print(e)
+MS_order.column_info_
```
-Ultimately, `M` expects 3 columns for new predictions because it was fit
-with a matrix having 3 columns (the first representing an intercept).
+## Structure of a `ModelSpec`
-We might be tempted to try as with the `pd.DataFrame` and produce
-an `np.ndarray` with only the necessary variables.
+The first argument to `ModelSpec` is stored as the `terms` attribute. Under the hood,
+this sequence is inspected to produce the `terms_` attribute which specify the objects
+that will ultimately create the design matrix.
```{code-cell} ipython3
-try:
- new_X = np.zeros((2,2))
- new_X[:,0] = [10,20]
- new_X[:,1] = [40,50]
- new_D = design_np.transform(new_X)
- M.get_prediction(new_D).predicted_mean
-except IndexError as e:
- print(e)
+MS = ModelSpec(['ShelveLoc', 'Price'])
+MS.fit(Carseats)
+MS.terms_
```
-This fails because `design_np` is looking for column `3` from its `terms`:
+Each element of `terms_` should be a `Feature` which describes a set of columns to be extracted from
+a columnar data form as well as a possible encoder.
```{code-cell} ipython3
-design_np.terms_
+shelve_var = MS.terms_[0]
```
-However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed
-represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:
+We can find the columns associated to each term using the `build_columns` method of `ModelSpec`:
```{code-cell} ipython3
-new_X = np.zeros((2,4))
-new_X[:,0] = [10,20]
-new_X[:,3] = [40,50]
-new_D = design_np.transform(new_X)
-M.get_prediction(new_D).predicted_mean
+df, names = build_columns(MS.column_info_,
+ Carseats,
+ shelve_var)
+df
```
-Given this subtlety about needing to supply arrays with identical column structure to `transform` when
-using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case.
+The design matrix is constructed by running through `terms_` and concatenating the corresponding columns.
+++
-## A model with some categorical variables
+### `Feature` objects
-Categorical variables become `Column` instances with encoders.
-
-```{code-cell} ipython3
-design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)
-design.column_info_['UIncome']
-```
-
-```{code-cell} ipython3
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef
-```
-
-## Getting the encoding you want
-
-By default the level dropped by `ModelSpec` will be the first of the `categories_` values from
-`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems
-as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`
-where `new_encoder` would somehow drop the column we want dropped.
-
-However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:
-
-```{code-cell} ipython3
-design.column_info_['UIncome']
-```
-
-This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when
-we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest
-and we need a way to allow different encodings of the same column of `Carseats`
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)
-```
-
- We can create a new
-`Column` with the encoder we want. For categorical variables, there is a convenience function to do so.
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import contrast
-pref_encoding = contrast('UIncome', 'drop', 'L')
-```
-
-```{code-cell} ipython3
-design.build_columns(Carseats, pref_encoding)
-```
-
-```{code-cell} ipython3
-design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-sm.OLS(Y, X).fit().params
-```
+Note that `Feature` objects have a tuple of `variables` as well as an `encoder` attribute. The
+tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then
+is run through `encoder.transform`. The `encoder.fit` method of each `Feature` is run once during
+the call to `ModelSpec.fit`.
```{code-cell} ipython3
-%%R
-lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef
+new_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=None)
+build_columns(MS.column_info_,
+ Carseats,
+ new_var)[0]
```
-## Interactions
-
-We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument
-to `ModelSpec`.
+Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the
+arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`.
```{code-cell} ipython3
-design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
+from sklearn.decomposition import PCA
+pca = PCA(n_components=2)
+pca.fit(build_columns(MS.column_info_, Carseats, new_var)[0]) # this is done within `ModelSpec.fit`
+pca_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=pca)
+build_columns(MS.column_info_,
+ Carseats,
+ pca_var)[0]
```
-The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with
-`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`.
+The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`)
+or `Feature` instances (`pca_var`).
```{code-cell} ipython3
-design.terms_[0]
+price = MS.column_info_['Price']
+fancy_var = Feature(('Income', price, pca_var), name='fancy', encoder=None)
+build_columns(MS.column_info_,
+ Carseats,
+ fancy_var)[0]
```
-Comparing this to the previous `R` model.
+## Predicting at new points
```{code-cell} ipython3
-%%R
-lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)
+MS = ModelSpec(['Price', 'Income']).fit(Carseats)
+X = MS.transform(Carseats)
+Y = Carseats['Sales']
+M_ols = sm.OLS(Y, X).fit()
+M_ols.params
```
-We note a few important things:
-
-1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these
-columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**
-
-2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**
-
-A few notes:
-
-- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**
-
-- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily.
-
-```{code-cell} ipython3
-full_encoding = contrast('UIncome', None)
-design.build_columns(Carseats, full_encoding)
-```
+As `ModelSpec` is a transformer, it can be evaluated at new feature values.
+Constructing the design matrix at any values is carried out by the `transform` method.
```{code-cell} ipython3
-design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
+new_data = pd.DataFrame({'Price':[40, 50], 'Income':[10, 20]})
+new_X = MS.transform(new_data)
+M_ols.get_prediction(new_X).predicted_mean
```
-## Special encodings
-
-For flexible models, we may want to consider transformations of features, i.e. polynomial
-or spline transformations. Given transforms that follow the `fit/transform` paradigm
-we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`
-package includes a `Poly` transform
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import poly
-poly('Income', 3)
-```
+## Using `np.ndarray`
-```{code-cell} ipython3
-design = ModelSpec([poly('Income', 3), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
+As the basic model is to concatenate columns extracted from a columnar data
+representation, one *can* use `np.ndarray` as the column data. In this case,
+columns will be selected by integer indices.
-Compare:
+### Caveats using `np.ndarray`
-```{code-cell} ipython3
-%%R
-lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef
-```
+If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.
+However,
+unless all features are floats, `np.ndarray` will default to a dtype of `object`, complicating issues.
-## Splines
+In contrast, if we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,
+in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning.
-Support for natural and B-splines is also included
+We illustrate this below, where we build a model from `Price` and `Income` for `Sales` and want to find predictions at new
+values of `Price` and `Income`. We first find the predictions using `pd.DataFrame` and then illustrate the difficulties
+in using `np.ndarray`.
-```{code-cell} ipython3
-from ISLP.models.model_spec import ns, bs, pca
-design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
++++
-```{code-cell} ipython3
-%%R
-library(splines)
-lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef
-```
+We will refit this model, using `ModelSpec` with an `np.ndarray` instead.
```{code-cell} ipython3
-design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
+Carseats_np = np.asarray(Carseats[['Price', 'Education', 'Income']])
+MS_np = ModelSpec([0,2]).fit(Carseats_np)
+MS_np.transform(Carseats_np)
```
```{code-cell} ipython3
-%%R
-lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef
+M_ols_np = sm.OLS(Y, MS_np.transform(Carseats_np)).fit()
+M_ols_np.params
```
-## PCA
+Now, let's consider finding the design matrix at new points.
+When using `pd.DataFrame` we only need to supply the `transform` method
+a data frame with columns implicated in the `terms` argument (in this case, `Price` and `Income`).
-```{code-cell} ipython3
-design = ModelSpec([pca(['Income',
- 'Price',
- 'Advertising',
- 'Population'],
- n_components=2,
- name='myvars'), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
+However, when using `np.ndarray` with integers as indices, `Price` was column 0 and `Income` was column 2. The only
+sensible way to produce a return for predict is to extract its 0th and 2nd columns. Note this means
+that the meaning of columns in an `np.ndarray` provided to `transform` essentially must be identical to those
+passed to `fit`.
```{code-cell} ipython3
-%%R
-lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)
+try:
+ new_D = np.array([[40,50], [10,20]]).T
+ new_X = MS_np.transform(new_D)
+except IndexError as e:
+ print(e)
```
-It is of course common to scale before running PCA.
+Ultimately, `M_ols` expects 3 columns for new predictions because it was fit
+with a matrix having 3 columns (the first representing an intercept).
-```{code-cell} ipython3
-design = ModelSpec([pca(['Income',
- 'Price',
- 'Advertising',
- 'Population'],
- n_components=2,
- name='myvars',
- scale=True), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
+We might be tempted, as with the `pd.DataFrame`, to produce
+an `np.ndarray` with only the necessary variables.
```{code-cell} ipython3
-%%R
-lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)
+new_D = np.array([[40,50], [np.nan, np.nan], [10,20]]).T
+new_X = MS_np.transform(new_D)
+print(new_X)
+M_ols.get_prediction(new_X).predicted_mean
```
-There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead
-of `np.std(ddof=1)`.
-
-```{code-cell} ipython3
-np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))
-```
+For more complicated design constructions, ensuring that the columns of `new_D` match those of the original data will be more cumbersome. We expect
+then that `pd.DataFrame` (or a columnar data representation with similar API) will likely be easier to use with `ModelSpec`.
diff --git a/docs/jupyterbook/models/submodels.ipynb b/docs/jupyterbook/models/submodels.ipynb
deleted file mode 100644
index 777037a..0000000
--- a/docs/jupyterbook/models/submodels.ipynb
+++ /dev/null
@@ -1,3127 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "ee33d364",
- "metadata": {},
- "source": [
- "# Building design matrices with `ModelSpec`\n",
- "\n",
- "Force rebuild"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "4c70fbaa",
- "metadata": {},
- "outputs": [],
- "source": [
- "x=4\n",
- "import numpy as np, pandas as pd\n",
- "%load_ext rpy2.ipython\n",
- "\n",
- "from ISLP import load_data\n",
- "from ISLP.models import ModelSpec\n",
- "\n",
- "import statsmodels.api as sm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "8a708215",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n",
- " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats = load_data('Carseats')\n",
- "%R -i Carseats\n",
- "Carseats.columns"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dad5e991",
- "metadata": {},
- "source": [
- "## Let's break up income into groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "ac7086a5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: OIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L' < 'M' < 'H']"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'])\n",
- "Carseats['OIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "261446c8",
- "metadata": {},
- "source": [
- "Let's also create an unordered version"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "674bb806",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: UIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L', 'M', 'H']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'],\n",
- " ordered=False)\n",
- "Carseats['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8f030039",
- "metadata": {},
- "source": [
- "## A simple model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "40cd6c28",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Price', 'Income'], dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Price', 'Income'])\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "e65f5607",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 12.661546\n",
- "Price -0.052213\n",
- "Income 0.012829\n",
- "dtype: float64"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Y = Carseats['Sales']\n",
- "M = sm.OLS(Y, X).fit()\n",
- "M.params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "29d9b55f",
- "metadata": {},
- "source": [
- "## Basic procedure\n",
- "\n",
- "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n",
- "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n",
- "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "cfbe5b92",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 Bad\n",
- "1 Good\n",
- "2 Medium\n",
- "3 Medium\n",
- "4 Bad\n",
- " ... \n",
- "395 Good\n",
- "396 Medium\n",
- "397 Medium\n",
- "398 Bad\n",
- "399 Good\n",
- "Name: ShelveLoc, Length: 400, dtype: category\n",
- "Categories (3, object): ['Bad', 'Good', 'Medium']"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['ShelveLoc']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7092f666",
- "metadata": {},
- "source": [
- "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n",
- "`get_columns` and `fit_encoder`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "e2d43844",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['ShelveLoc']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "46a01612",
- "metadata": {},
- "source": [
- "It recognized ordinal columns as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "465a9326",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['OIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "76f8480d",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([ 73, 48, 35, 100]), ('Income',))"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "income = design.column_info_['Income']\n",
- "cols, names = income.get_columns(Carseats)\n",
- "(cols[:4], names)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "25fcc1de",
- "metadata": {},
- "source": [
- "## Encoding a column\n",
- "\n",
- "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n",
- "variables usually are encoded by several columns, typically one less than the number of categories.\n",
- "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n",
- "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n",
- "its encoder the first time data is passed to it."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "dfe6cc35",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([[0., 0.],\n",
- " [1., 0.],\n",
- " [0., 1.],\n",
- " [0., 1.]]),\n",
- " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "shelve = design.column_info_['ShelveLoc']\n",
- "cols, names = shelve.get_columns(Carseats)\n",
- "(cols[:4], names)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "8fc9779a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[2.],\n",
- " [1.],\n",
- " [1.],\n",
- " [0.]])"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "oincome = design.column_info_['OIncome']\n",
- "oincome.get_columns(Carseats)[0][:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8e04da60",
- "metadata": {},
- "source": [
- "## The terms\n",
- "\n",
- "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n",
- "the `terms` argument which should be a sequence. The elements of `terms` are often\n",
- "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n",
- "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "c579dbce",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Price', 'Income']"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "4587b8bd",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2595f0fa",
- "metadata": {},
- "source": [
- "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n",
- "`Variable` can also create columns through the `build_columns` method of `ModelSpec`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "03bd9366",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price\n",
- " 0 120\n",
- " 1 83\n",
- " 2 80\n",
- " 3 97\n",
- " 4 128\n",
- " .. ...\n",
- " 395 128\n",
- " 396 120\n",
- " 397 159\n",
- " 398 95\n",
- " 399 120\n",
- " \n",
- " [400 rows x 1 columns],\n",
- " ['Price'])"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "price = design.terms_[0]\n",
- "design.build_columns(Carseats, price)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "de04ca48",
- "metadata": {},
- "source": [
- "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
- "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
- "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n",
- "the call to `ModelSpec.fit`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "a42af4c5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price Income UIncome[L] UIncome[M]\n",
- " 0 120.0 73.0 0.0 1.0\n",
- " 1 83.0 48.0 1.0 0.0\n",
- " 2 80.0 35.0 1.0 0.0\n",
- " 3 97.0 100.0 0.0 0.0\n",
- " 4 128.0 64.0 0.0 1.0\n",
- " .. ... ... ... ...\n",
- " 395 128.0 108.0 0.0 0.0\n",
- " 396 120.0 23.0 1.0 0.0\n",
- " 397 159.0 26.0 1.0 0.0\n",
- " 398 95.0 79.0 0.0 1.0\n",
- " 399 120.0 37.0 1.0 0.0\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import Variable\n",
- "\n",
- "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n",
- "design.build_columns(Carseats, new_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b146d0c0",
- "metadata": {},
- "source": [
- "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n",
- "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "b6c394a6",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( mynewvar[0] mynewvar[1]\n",
- " 0 -3.608693 -4.853177\n",
- " 1 15.081506 35.708630\n",
- " 2 27.422871 40.774250\n",
- " 3 -33.973209 13.470489\n",
- " 4 6.567316 -11.290100\n",
- " .. ... ...\n",
- " 395 -36.846346 -18.415783\n",
- " 396 45.741500 3.245602\n",
- " 397 49.097533 -35.725355\n",
- " 398 -13.577772 18.845139\n",
- " 399 31.927566 0.978436\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.decomposition import PCA\n",
- "pca = PCA(n_components=2)\n",
- "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n",
- "design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3bb30a3f",
- "metadata": {},
- "source": [
- "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
- "or `Variable` instances (`pca_var`)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "ea7770ff",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( Price Price mynewvar[0] mynewvar[1]\n",
- " 0 120.0 120.0 -3.608693 -4.853177\n",
- " 1 83.0 83.0 15.081506 35.708630\n",
- " 2 80.0 80.0 27.422871 40.774250\n",
- " 3 97.0 97.0 -33.973209 13.470489\n",
- " 4 128.0 128.0 6.567316 -11.290100\n",
- " .. ... ... ... ...\n",
- " 395 128.0 128.0 -36.846346 -18.415783\n",
- " 396 120.0 120.0 45.741500 3.245602\n",
- " 397 159.0 159.0 49.097533 -35.725355\n",
- " 398 95.0 95.0 -13.577772 18.845139\n",
- " 399 120.0 120.0 31.927566 0.978436\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n",
- "design.build_columns(Carseats, fancy_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b2b4a01a",
- "metadata": {},
- "source": [
- "We can of course run PCA again on these features (if we wanted)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "21ad8b44",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( fancy_pca[0] fancy_pca[1]\n",
- " 0 -6.951792 4.859283\n",
- " 1 55.170148 -24.694875\n",
- " 2 59.418556 -38.033572\n",
- " 3 34.722389 28.922184\n",
- " 4 -21.419184 -3.120673\n",
- " .. ... ...\n",
- " 395 -18.257348 40.760122\n",
- " 396 -10.546709 -45.021658\n",
- " 397 -77.706359 -37.174379\n",
- " 398 36.668694 7.730851\n",
- " 399 -9.540535 -31.059122\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['fancy_pca[0]', 'fancy_pca[1]'])"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pca2 = PCA(n_components=2)\n",
- "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n",
- "design.build_columns(Carseats, pca2_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2262377d",
- "metadata": {},
- "source": [
- "## Building the design matrix\n",
- "\n",
- "With these notions in mind, the final design is essentially then"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "1654ca47",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1db0e0a9",
- "metadata": {},
- "source": [
- "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n",
- "a dataframe the index is adjusted accordingly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "d20e8ea8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.intercept"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "450fe910",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " Price | \n",
- " Income | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 120 | \n",
- " 73 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 83 | \n",
- " 48 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 80 | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 97 | \n",
- " 100 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " intercept Price Income\n",
- "0 1.0 120 73\n",
- "1 1.0 83 48\n",
- "2 1.0 80 35\n",
- "3 1.0 97 100"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.transform(Carseats)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0705ba6f",
- "metadata": {},
- "source": [
- "## Predicting\n",
- "\n",
- "Constructing the design matrix at any values is carried out by the `transform` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "866c2863",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n",
- "new_X = design.transform(new_data)\n",
- "M.get_prediction(new_X).predicted_mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "f2021166",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 0 1 \n",
- "12.65258 12.25873 \n"
- ]
- }
- ],
- "source": [
- "%%R -i new_data,Carseats\n",
- "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20e1a31a",
- "metadata": {},
- "source": [
- "### Difference between using `pd.DataFrame` and `np.ndarray`\n",
- "\n",
- "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
- "\n",
- "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
- "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "a5926ec9",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[1.0, 120, 73],\n",
- " [1.0, 83, 48],\n",
- " [1.0, 80, 35],\n",
- " [1.0, 97, 100]], dtype=object)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n",
- "design_np = ModelSpec([0,3]).fit(Carseats_np)\n",
- "design_np.transform(Carseats_np)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "997a63cb",
- "metadata": {},
- "source": [
- "The following will fail for hopefully obvious reasons"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "40410c48",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_D = np.zeros((2,2))\n",
- " new_D[:,0] = [10,20]\n",
- " new_D[:,1] = [40,50]\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except ValueError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "920203e9",
- "metadata": {},
- "source": [
- "Ultimately, `M` expects 3 columns for new predictions because it was fit\n",
- "with a matrix having 3 columns (the first representing an intercept).\n",
- "\n",
- "We might be tempted to try as with the `pd.DataFrame` and produce\n",
- "an `np.ndarray` with only the necessary variables."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "1061da77",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "index 3 is out of bounds for axis 1 with size 2\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_X = np.zeros((2,2))\n",
- " new_X[:,0] = [10,20]\n",
- " new_X[:,1] = [40,50]\n",
- " new_D = design_np.transform(new_X)\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except IndexError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c6bfe001",
- "metadata": {},
- "source": [
- "This fails because `design_np` is looking for column `3` from its `terms`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "5ae6d25f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design_np.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "edd7ebeb",
- "metadata": {},
- "source": [
- "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n",
- "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "9455e532",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_X = np.zeros((2,4))\n",
- "new_X[:,0] = [10,20]\n",
- "new_X[:,3] = [40,50]\n",
- "new_D = design_np.transform(new_X)\n",
- "M.get_prediction(new_D).predicted_mean"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fd726791",
- "metadata": {},
- "source": [
- "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n",
- "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "967d9ebc",
- "metadata": {},
- "source": [
- "## A model with some categorical variables\n",
- "\n",
- "Categorical variables become `Column` instances with encoders."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "d0429b56",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n",
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "415e3fd0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "id": "8a99c3a5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 11.876012\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[L] -1.042297\n",
- "UIncome[M] -0.119123\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "9250a28a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fe90c12c",
- "metadata": {},
- "source": [
- "## Getting the encoding you want\n",
- "\n",
- "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n",
- "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n",
- "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n",
- "where `new_encoder` would somehow drop the column we want dropped. \n",
- "\n",
- "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "0546ec84",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6ec4fe65",
- "metadata": {},
- "source": [
- "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n",
- "we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n",
- "and we need a way to allow different encodings of the same column of `Carseats`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "61e7f56e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "802ed854",
- "metadata": {},
- "source": [
- " We can create a new \n",
- "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "82d7a01d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import contrast\n",
- "pref_encoding = contrast('UIncome', 'drop', 'L')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "e26849a1",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[M] UIncome[H]\n",
- " 0 1.0 0.0\n",
- " 1 0.0 0.0\n",
- " 2 0.0 0.0\n",
- " 3 0.0 1.0\n",
- " 4 1.0 0.0\n",
- " .. ... ...\n",
- " 395 0.0 1.0\n",
- " 396 0.0 0.0\n",
- " 397 0.0 0.0\n",
- " 398 1.0 0.0\n",
- " 399 0.0 0.0\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['UIncome[M]', 'UIncome[H]'])"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, pref_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "2fc4cd8c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "49e33d41",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 10.833715\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[M] 0.923174\n",
- "UIncome[H] 1.042297\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "ce018fdf",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2d42b822",
- "metadata": {},
- "source": [
- "## Interactions\n",
- "\n",
- "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n",
- "to `ModelSpec`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "fbb3e3ba",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 7.866634\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "UIncome[L] -2.734895\n",
- "UIncome[M] -2.619745\n",
- "dtype: float64"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f9a7d4ad",
- "metadata": {},
- "source": [
- "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n",
- "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "5a6f8e69",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n",
- " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n",
- " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "98eef5c8",
- "metadata": {},
- "source": [
- "Comparing this to the previous `R` model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "58c99601",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9c979d7e",
- "metadata": {},
- "source": [
- "We note a few important things:\n",
- "\n",
- "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n",
- "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n",
- "\n",
- "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n",
- "\n",
- "A few notes:\n",
- "\n",
- "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n",
- "\n",
- "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "0cb3b63a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[H] UIncome[L] UIncome[M]\n",
- " 0 0.0 0.0 1.0\n",
- " 1 0.0 1.0 0.0\n",
- " 2 0.0 1.0 0.0\n",
- " 3 1.0 0.0 0.0\n",
- " 4 0.0 0.0 1.0\n",
- " .. ... ... ...\n",
- " 395 1.0 0.0 0.0\n",
- " 396 0.0 1.0 0.0\n",
- " 397 0.0 1.0 0.0\n",
- " 398 0.0 0.0 1.0\n",
- " 399 0.0 1.0 0.0\n",
- " \n",
- " [400 rows x 3 columns],\n",
- " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "full_encoding = contrast('UIncome', None)\n",
- "design.build_columns(Carseats, full_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "id": "272098d7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.131739\n",
- "UIncome[M] 0.115150\n",
- "UIncome[H] 1.156118\n",
- "UIncome[H]:ShelveLoc[Good] 3.738052\n",
- "UIncome[H]:ShelveLoc[Medium] 1.514104\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "dtype: float64"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fe05c471",
- "metadata": {},
- "source": [
- "## Special encodings\n",
- "\n",
- "For flexible models, we may want to consider transformations of features, i.e. polynomial\n",
- "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n",
- "we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`\n",
- "package includes a `Poly` transform"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "id": "67062299",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('Income',), name='poly(Income, 3, )', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import poly\n",
- "poly('Income', 3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "df5e5b4d",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.440077\n",
- "poly(Income, 3, )[0] 10.036373\n",
- "poly(Income, 3, )[1] -2.799156\n",
- "poly(Income, 3, )[2] 2.399601\n",
- "ShelveLoc[Good] 4.808133\n",
- "ShelveLoc[Medium] 1.889533\n",
- "dtype: float64"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "01be9c13",
- "metadata": {},
- "source": [
- "Compare:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "3244d6f6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n",
- " 5.440077 10.036373 -2.799156 2.399601 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.808133 1.889533 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8ad5bb1d",
- "metadata": {},
- "source": [
- "## Splines\n",
- "\n",
- "Support for natural and B-splines is also included"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "6a6f4358",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 4.240421\n",
- "ns(Income, , df=5)[0] 1.468196\n",
- "ns(Income, , df=5)[1] 1.499471\n",
- "ns(Income, , df=5)[2] 1.152070\n",
- "ns(Income, , df=5)[3] 2.418398\n",
- "ns(Income, , df=5)[4] 1.804460\n",
- "ShelveLoc[Good] 4.810449\n",
- "ShelveLoc[Medium] 1.881095\n",
- "dtype: float64"
- ]
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import ns, bs, pca\n",
- "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "fb740953",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n",
- " 4.240421 1.468196 1.499471 1.152070 \n",
- "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n",
- " 2.418398 1.804460 4.810449 1.881095 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "library(splines)\n",
- "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "id": "fe1bf7fe",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 3.495085\n",
- "bs(Income, , df=7, degree=2)[0] 1.813118\n",
- "bs(Income, , df=7, degree=2)[1] 0.961852\n",
- "bs(Income, , df=7, degree=2)[2] 2.471545\n",
- "bs(Income, , df=7, degree=2)[3] 2.158891\n",
- "bs(Income, , df=7, degree=2)[4] 2.091625\n",
- "bs(Income, , df=7, degree=2)[5] 2.600669\n",
- "bs(Income, , df=7, degree=2)[6] 2.843108\n",
- "ShelveLoc[Good] 4.804919\n",
- "ShelveLoc[Medium] 1.880337\n",
- "dtype: float64"
- ]
- },
- "execution_count": 51,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "86e966e0",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) bs(Income, df = 7, degree = 2)1 \n",
- " 3.4950851 1.8131176 \n",
- "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n",
- " 0.9618523 2.4715450 \n",
- "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n",
- " 2.1588908 2.0916252 \n",
- "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n",
- " 2.6006694 2.8431084 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.8049190 1.8803375 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "877d4784",
- "metadata": {},
- "source": [
- "## PCA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "8ba6cb20",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.419405\n",
- "pca(myvars, , n_components=2)[0] -0.001131\n",
- "pca(myvars, , n_components=2)[1] -0.024217\n",
- "ShelveLoc[Good] 4.816253\n",
- "ShelveLoc[Medium] 1.924139\n",
- "dtype: float64"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars'), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "f0319e51",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.419405 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n",
- " 0.001131 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n",
- " -0.024217 \n",
- " ShelveLocGood \n",
- " 4.816253 \n",
- " ShelveLocMedium \n",
- " 1.924139 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1f55086a",
- "metadata": {},
- "source": [
- "It is of course common to scale before running PCA."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "id": "bbe9e004",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.352159\n",
- "pca(myvars, , n_components=2)[0] 0.446383\n",
- "pca(myvars, , n_components=2)[1] -1.219788\n",
- "ShelveLoc[Good] 4.922780\n",
- "ShelveLoc[Medium] 2.005617\n",
- "dtype: float64"
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars',\n",
- " scale=True), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "id": "d78c02e4",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.3522 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n",
- " 0.4469 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n",
- " -1.2213 \n",
- " ShelveLocGood \n",
- " 4.9228 \n",
- " ShelveLocMedium \n",
- " 2.0056 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8a03c603",
- "metadata": {},
- "source": [
- "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n",
- "of `np.std(ddof=1)`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "id": "f8215cef",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 0.44694166, -1.22131519])"
- ]
- },
- "execution_count": 57,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a15d0ead",
- "metadata": {},
- "source": [
- "## Submodels\n",
- "\n",
- "We can build submodels as well, even if the terms do not appear in the original `terms` argument.\n",
- "Fundamentally, the terms just need to be able to have the `design.build_columns` work for us to be\n",
- "able to build a design matrix. The initial inspection of the columns of `Carseats` has created\n",
- "a column for `US`, hence we can build this submodel."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "id": "d58c6244",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " US[Yes] | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 395 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 396 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 397 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 398 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 399 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
400 rows × 2 columns
\n",
- "
"
- ],
- "text/plain": [
- " intercept US[Yes]\n",
- "0 1.0 1.0\n",
- "1 1.0 1.0\n",
- "2 1.0 1.0\n",
- "3 1.0 1.0\n",
- "4 1.0 0.0\n",
- ".. ... ...\n",
- "395 1.0 1.0\n",
- "396 1.0 1.0\n",
- "397 1.0 1.0\n",
- "398 1.0 1.0\n",
- "399 1.0 1.0\n",
- "\n",
- "[400 rows x 2 columns]"
- ]
- },
- "execution_count": 58,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['UIncome', 'ShelveLoc', 'Price']).fit(Carseats)\n",
- "design.build_submodel(Carseats, ['US'])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9365ba27",
- "metadata": {},
- "source": [
- "## ANOVA \n",
- "\n",
- "For a given `terms` argument, there as a natural sequence of models, namely those specified by `[terms[:i] for i in range(len(terms)+1]`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "id": "332ab454",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['intercept'], dtype='object')\n",
- "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]'], dtype='object')\n",
- "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price'], dtype='object')\n",
- "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price',\n",
- " 'UIncome[L]', 'UIncome[M]'],\n",
- " dtype='object')\n",
- "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price',\n",
- " 'UIncome[L]', 'UIncome[M]', 'US[Yes]'],\n",
- " dtype='object')\n"
- ]
- }
- ],
- "source": [
- "design = ModelSpec(['ShelveLoc', 'Price', 'UIncome', 'US']).fit(Carseats)\n",
- "for D in design.build_sequence(Carseats):\n",
- " print(D.columns)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "id": "f6cfd031",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 397.0 | \n",
- " 2172.743555 | \n",
- " 2.0 | \n",
- " 1009.531143 | \n",
- " 153.010858 | \n",
- " 5.452815e-50 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 396.0 | \n",
- " 1455.640702 | \n",
- " 1.0 | \n",
- " 717.102853 | \n",
- " 217.377192 | \n",
- " 1.583751e-39 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 394.0 | \n",
- " 1378.915938 | \n",
- " 2.0 | \n",
- " 76.724764 | \n",
- " 11.628885 | \n",
- " 1.239031e-05 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 393.0 | \n",
- " 1296.462700 | \n",
- " 1.0 | \n",
- " 82.453238 | \n",
- " 24.994257 | \n",
- " 8.678832e-07 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 397.0 2172.743555 2.0 1009.531143 153.010858 5.452815e-50\n",
- "2 396.0 1455.640702 1.0 717.102853 217.377192 1.583751e-39\n",
- "3 394.0 1378.915938 2.0 76.724764 11.628885 1.239031e-05\n",
- "4 393.0 1296.462700 1.0 82.453238 24.994257 8.678832e-07"
- ]
- },
- "execution_count": 60,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "id": "11c4aee8",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Response: Sales\n",
- " Df Sum Sq Mean Sq F value Pr(>F) \n",
- "ShelveLoc 2 1009.53 504.77 153.011 < 2.2e-16 ***\n",
- "Price 1 717.10 717.10 217.377 < 2.2e-16 ***\n",
- "UIncome 2 76.72 38.36 11.629 1.240e-05 ***\n",
- "US 1 82.45 82.45 24.994 8.679e-07 ***\n",
- "Residuals 393 1296.46 3.30 \n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "anova(lm(Sales ~ ShelveLoc + Price + UIncome + US, data=Carseats))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9a4e6e63",
- "metadata": {},
- "source": [
- "Recall that `ModelSpec` does not inspect `terms` to reorder based on degree of \n",
- "interaction as `R` does:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "id": "6e7bf361",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 393.0 | \n",
- " 2059.376413 | \n",
- " 6.0 | \n",
- " 1122.898284 | \n",
- " 35.940047 | \n",
- " 1.175738e-34 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 391.0 | \n",
- " 2036.044596 | \n",
- " 2.0 | \n",
- " 23.331817 | \n",
- " 2.240310 | \n",
- " 1.077900e-01 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 393.0 2059.376413 6.0 1122.898284 35.940047 1.175738e-34\n",
- "2 391.0 2036.044596 2.0 23.331817 2.240310 1.077900e-01"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([(full_encoding, 'ShelveLoc'), pref_encoding]).fit(Carseats)\n",
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "id": "ed7d4bfa",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Response: Sales\n",
- " Df Sum Sq Mean Sq F value Pr(>F) \n",
- "UIncome 2 61.92 30.962 5.9458 0.002859 ** \n",
- "UIncome:ShelveLoc 6 1084.31 180.718 34.7049 < 2.2e-16 ***\n",
- "Residuals 391 2036.04 5.207 \n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "anova(lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0350da34",
- "metadata": {},
- "source": [
- "To agree with `R` we must order `terms` as `R` will."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "id": "5ddaf87c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 397.0 | \n",
- " 3120.351382 | \n",
- " 2.0 | \n",
- " 61.923316 | \n",
- " 5.945846 | \n",
- " 2.855424e-03 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 391.0 | \n",
- " 2036.044596 | \n",
- " 6.0 | \n",
- " 1084.306785 | \n",
- " 34.704868 | \n",
- " 1.346561e-33 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 397.0 3120.351382 2.0 61.923316 5.945846 2.855424e-03\n",
- "2 391.0 2036.044596 6.0 1084.306785 34.704868 1.346561e-33"
- ]
- },
- "execution_count": 64,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]).fit(Carseats)\n",
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1ef70ce3",
- "metadata": {},
- "source": [
- "## More complicated interactions\n",
- "\n",
- "Can we have an interaction of a polynomial effect with a categorical? Absolutely"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "id": "a1a14742",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Response: Sales\n",
- " Df Sum Sq Mean Sq F value Pr(>F) \n",
- "UIncome 2 61.92 30.9617 4.0310 0.01851 *\n",
- "UIncome:poly(Income, 3) 9 79.72 8.8581 1.1533 0.32408 \n",
- "UIncome:US 3 83.51 27.8367 3.6242 0.01324 *\n",
- "Residuals 385 2957.12 7.6808 \n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "anova(lm(Sales ~ UIncome + poly(Income, 3):UIncome + UIncome:US, data=Carseats))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a909be1a",
- "metadata": {},
- "source": [
- "To match `R` we note that it has used its inspection rules to encode `UIncome` with 3 levels\n",
- "for the two interactions."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "id": "ae286cf3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 65.978856\n",
- "UIncome[M] -60.159607\n",
- "UIncome[H] -147.276154\n",
- "poly(Income, 3, )[0]:UIncome[H] 1957.694387\n",
- "poly(Income, 3, )[0]:UIncome[L] 1462.060650\n",
- "poly(Income, 3, )[0]:UIncome[M] 83.035153\n",
- "poly(Income, 3, )[1]:UIncome[H] -984.494570\n",
- "poly(Income, 3, )[1]:UIncome[L] 881.537647\n",
- "poly(Income, 3, )[1]:UIncome[M] -18.006234\n",
- "poly(Income, 3, )[2]:UIncome[H] 207.614692\n",
- "poly(Income, 3, )[2]:UIncome[L] 217.190749\n",
- "poly(Income, 3, )[2]:UIncome[M] 34.065434\n",
- "UIncome[H]:US 0.903404\n",
- "UIncome[L]:US 0.895538\n",
- "UIncome[M]:US 1.048728\n",
- "dtype: float64"
- ]
- },
- "execution_count": 66,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "p3 = poly('Income', 3)\n",
- "design = ModelSpec([pref_encoding, (p3, full_encoding), (full_encoding, 'US')]).fit(Carseats)\n",
- "X = design.transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "id": "236ab2d2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 397.0 | \n",
- " 3120.351382 | \n",
- " 2.0 | \n",
- " 61.923316 | \n",
- " 4.031032 | \n",
- " 0.018488 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 388.0 | \n",
- " 3040.628559 | \n",
- " 9.0 | \n",
- " 79.722823 | \n",
- " 1.153273 | \n",
- " 0.324049 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 385.0 | \n",
- " 2957.118444 | \n",
- " 3.0 | \n",
- " 83.510115 | \n",
- " 3.624181 | \n",
- " 0.013244 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 397.0 3120.351382 2.0 61.923316 4.031032 0.018488\n",
- "2 388.0 3040.628559 9.0 79.722823 1.153273 0.324049\n",
- "3 385.0 2957.118444 3.0 83.510115 3.624181 0.013244"
- ]
- },
- "execution_count": 67,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0a45c720",
- "metadata": {},
- "source": [
- "## Grouping columns for ANOVA\n",
- "\n",
- "The `Variable` construct can be used to group\n",
- "variables together to get custom sequences of models for `anova_lm`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "id": "f36c1b3b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['intercept'], dtype='object')\n",
- "Index(['intercept', 'Price', 'UIncome[M]', 'UIncome[H]'], dtype='object')\n",
- "Index(['intercept', 'Price', 'UIncome[M]', 'UIncome[H]', 'US[Yes]',\n",
- " 'Advertising'],\n",
- " dtype='object')\n"
- ]
- }
- ],
- "source": [
- "group1 = Variable(('Price', pref_encoding), 'group1', None)\n",
- "group2 = Variable(('US', 'Advertising'), 'group2', None)\n",
- "design = ModelSpec([group1, group2]).fit(Carseats)\n",
- "for D in design.build_sequence(Carseats):\n",
- " print(D.columns)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "id": "3daf7638",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 396.0 | \n",
- " 2508.187788 | \n",
- " 3.0 | \n",
- " 674.086910 | \n",
- " 39.304841 | \n",
- " 2.970412e-22 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 394.0 | \n",
- " 2252.396343 | \n",
- " 2.0 | \n",
- " 255.791445 | \n",
- " 22.372135 | \n",
- " 6.267562e-10 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 396.0 2508.187788 3.0 674.086910 39.304841 2.970412e-22\n",
- "2 394.0 2252.396343 2.0 255.791445 22.372135 6.267562e-10"
- ]
- },
- "execution_count": 69,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "46c1ace8",
- "metadata": {},
- "source": [
- "It is not clear this is simple to do in `R` as the formula object expands all parentheses."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "id": "0b87e430",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Response: Sales\n",
- " Df Sum Sq Mean Sq F value Pr(>F) \n",
- "Price 1 630.03 630.03 110.2079 < 2.2e-16 ***\n",
- "UIncome 2 44.06 22.03 3.8533 0.02201 * \n",
- "US 1 121.88 121.88 21.3196 5.270e-06 ***\n",
- "Advertising 1 133.91 133.91 23.4247 1.868e-06 ***\n",
- "Residuals 394 2252.40 5.72 \n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "anova(lm(Sales ~ (Price + UIncome) + (US + Advertising), data=Carseats))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7c137360",
- "metadata": {},
- "source": [
- "It can be done by building up the models\n",
- "by hand and likely is possible to be done programmatically but it seems not obvious."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "id": "b678d323",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Model 1: Sales ~ 1\n",
- "Model 2: Sales ~ Price + UIncome\n",
- "Model 3: Sales ~ Price + UIncome + US + Advertising\n",
- " Res.Df RSS Df Sum of Sq F Pr(>F) \n",
- "1 399 3182.3 \n",
- "2 396 2508.2 3 674.09 39.305 < 2.2e-16 ***\n",
- "3 394 2252.4 2 255.79 22.372 6.268e-10 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "M1 = lm(Sales ~ 1, data=Carseats)\n",
- "M2 = lm(Sales ~ Price + UIncome, data=Carseats)\n",
- "M3 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n",
- "anova(M1, M2, M3)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b0388949",
- "metadata": {},
- "source": [
- "## Alternative anova\n",
- "\n",
- "Another common ANOVA table involves dropping each term in succession from the model and comparing\n",
- "to the full model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 72,
- "id": "ac5b916a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'intercept'}\n",
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 395.0 4417.273517 0.0 NaN NaN NaN\n",
- "1 394.0 2252.396343 1.0 2164.877175 378.690726 1.359177e-59\n",
- "{'Price', 'UIncome[H]', 'UIncome[M]'}\n",
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 397.0 2950.808154 0.0 NaN NaN NaN\n",
- "1 394.0 2252.396343 3.0 698.411811 40.723184 6.077848e-23\n",
- "{'US[Yes]', 'Advertising'}\n",
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 396.0 2508.187788 0.0 NaN NaN NaN\n",
- "1 394.0 2252.396343 2.0 255.791445 22.372135 6.267562e-10\n"
- ]
- }
- ],
- "source": [
- "Dfull = design.transform(Carseats)\n",
- "Mfull = sm.OLS(Y, Dfull).fit()\n",
- "for i, D in enumerate(design.build_sequence(Carseats, anova_type='drop')):\n",
- " if i == 0:\n",
- " D0 = D\n",
- " print(set(D.columns) ^ set(Dfull.columns))\n",
- " print(sm.stats.anova_lm(sm.OLS(Y, D).fit(), Mfull))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 73,
- "id": "a0c71948",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Model 1: Sales ~ US + Advertising\n",
- "Model 2: Sales ~ Price + UIncome + US + Advertising\n",
- " Res.Df RSS Df Sum of Sq F Pr(>F) \n",
- "1 397 2950.8 \n",
- "2 394 2252.4 3 698.41 40.723 < 2.2e-16 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
- "Analysis of Variance Table\n",
- "\n",
- "Model 1: Sales ~ Price + UIncome\n",
- "Model 2: Sales ~ Price + UIncome + US + Advertising\n",
- " Res.Df RSS Df Sum of Sq F Pr(>F) \n",
- "1 396 2508.2 \n",
- "2 394 2252.4 2 255.79 22.372 6.268e-10 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n",
- "M2 = lm(Sales ~ US + Advertising, data=Carseats)\n",
- "print(anova(M2, M1))\n",
- "M3 = lm(Sales ~ Price + UIncome, data=Carseats)\n",
- "print(anova(M3, M1))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a5e4880d",
- "metadata": {},
- "source": [
- "The comparison without the intercept here is actually very hard to achieve in `R` with `anova` due to its inspection\n",
- "of the formula."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 74,
- "id": "4b383401",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Model 1: Sales ~ Price + UIncome + US + Advertising - 1\n",
- "Model 2: Sales ~ Price + UIncome + US + Advertising\n",
- " Res.Df RSS Df Sum of Sq F Pr(>F)\n",
- "1 394 2252.4 \n",
- "2 394 2252.4 0 9.0949e-13 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n",
- "M4 = lm(Sales ~ Price + UIncome + US + Advertising - 1, data=Carseats)\n",
- "print(anova(M4, M1))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "72d7c83b",
- "metadata": {},
- "source": [
- "It can be found with `summary`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "id": "4d5ce789",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ Price + UIncome + US + Advertising, data = Carseats)\n",
- "\n",
- "Residuals:\n",
- " Min 1Q Median 3Q Max \n",
- "-7.4437 -1.6351 -0.0932 1.4920 6.8076 \n",
- "\n",
- "Coefficients:\n",
- " Estimate Std. Error t value Pr(>|t|) \n",
- "(Intercept) 12.520356 0.643390 19.460 < 2e-16 ***\n",
- "Price -0.054000 0.005072 -10.647 < 2e-16 ***\n",
- "UIncomeM 0.548906 0.281693 1.949 0.0521 . \n",
- "UIncomeH 0.708219 0.322028 2.199 0.0284 * \n",
- "USYes 0.024181 0.343246 0.070 0.9439 \n",
- "Advertising 0.119509 0.024692 4.840 1.87e-06 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
- "\n",
- "Residual standard error: 2.391 on 394 degrees of freedom\n",
- "Multiple R-squared: 0.2922,\tAdjusted R-squared: 0.2832 \n",
- "F-statistic: 32.53 on 5 and 394 DF, p-value: < 2.2e-16\n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "summary(M1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "id": "56b82d02",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(378.690726, 378.69160000000005)"
- ]
- },
- "execution_count": 76,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "378.690726, 19.46**2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "872f645c-1d6f-4d08-9eec-2b80276bc82c",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "jupytext": {
- "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
- },
- "kernelspec": {
- "display_name": "islp_test",
- "language": "python",
- "name": "islp_test"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/jupyterbook/models/submodels.md b/docs/jupyterbook/models/submodels.md
deleted file mode 100644
index c2a97fd..0000000
--- a/docs/jupyterbook/models/submodels.md
+++ /dev/null
@@ -1,652 +0,0 @@
----
-jupytext:
- formats: source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb
- text_representation:
- extension: .md
- format_name: myst
- format_version: 0.13
- jupytext_version: 1.14.1
-kernelspec:
- display_name: islp_test
- language: python
- name: islp_test
----
-
-# Building design matrices with `ModelSpec`
-
-Force rebuild
-
-```{code-cell} ipython3
-x=4
-import numpy as np, pandas as pd
-%load_ext rpy2.ipython
-
-from ISLP import load_data
-from ISLP.models import ModelSpec
-
-import statsmodels.api as sm
-```
-
-```{code-cell} ipython3
-Carseats = load_data('Carseats')
-%R -i Carseats
-Carseats.columns
-```
-
-## Let's break up income into groups
-
-```{code-cell} ipython3
-Carseats['OIncome'] = pd.cut(Carseats['Income'],
- [0,50,90,200],
- labels=['L','M','H'])
-Carseats['OIncome']
-```
-
-Let's also create an unordered version
-
-```{code-cell} ipython3
-Carseats['UIncome'] = pd.cut(Carseats['Income'],
- [0,50,90,200],
- labels=['L','M','H'],
- ordered=False)
-Carseats['UIncome']
-```
-
-## A simple model
-
-```{code-cell} ipython3
-design = ModelSpec(['Price', 'Income'])
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-Y = Carseats['Sales']
-M = sm.OLS(Y, X).fit()
-M.params
-```
-
-## Basic procedure
-
-The design matrix is built by cobbling together a set of columns and possibly transforming them.
-A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`
-is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:
-
-```{code-cell} ipython3
-Carseats['ShelveLoc']
-```
-
-This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods
-`get_columns` and `fit_encoder`.
-
-```{code-cell} ipython3
-design.column_info_['ShelveLoc']
-```
-
-It recognized ordinal columns as well.
-
-```{code-cell} ipython3
-design.column_info_['OIncome']
-```
-
-```{code-cell} ipython3
-income = design.column_info_['Income']
-cols, names = income.get_columns(Carseats)
-(cols[:4], names)
-```
-
-## Encoding a column
-
-In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical
-variables usually are encoded by several columns, typically one less than the number of categories.
-This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform
-model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits
-its encoder the first time data is passed to it.
-
-```{code-cell} ipython3
-shelve = design.column_info_['ShelveLoc']
-cols, names = shelve.get_columns(Carseats)
-(cols[:4], names)
-```
-
-```{code-cell} ipython3
-oincome = design.column_info_['OIncome']
-oincome.get_columns(Carseats)[0][:4]
-```
-
-## The terms
-
-The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through
-the `terms` argument which should be a sequence. The elements of `terms` are often
-going to be strings (or tuples of strings for interactions, see below) but are converted to a
-`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple.
-
-```{code-cell} ipython3
-design.terms
-```
-
-```{code-cell} ipython3
-design.terms_
-```
-
-While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A
-`Variable` can also create columns through the `build_columns` method of `ModelSpec`
-
-```{code-cell} ipython3
-price = design.terms_[0]
-design.build_columns(Carseats, price)
-```
-
-Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The
-tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then
-is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during
-the call to `ModelSpec.fit`.
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import Variable
-
-new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)
-design.build_columns(Carseats, new_var)
-```
-
-Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the
-arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`.
-
-```{code-cell} ipython3
-from sklearn.decomposition import PCA
-pca = PCA(n_components=2)
-pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`
-pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)
-design.build_columns(Carseats, pca_var)
-```
-
-The elements of the `variables` attribute may be column identifiers ( `"Price"`), `Column` instances (`price`)
-or `Variable` instances (`pca_var`).
-
-```{code-cell} ipython3
-fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)
-design.build_columns(Carseats, fancy_var)
-```
-
-We can of course run PCA again on these features (if we wanted).
-
-```{code-cell} ipython3
-pca2 = PCA(n_components=2)
-pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`
-pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)
-design.build_columns(Carseats, pca2_var)
-```
-
-## Building the design matrix
-
-With these notions in mind, the final design is essentially then
-
-```{code-cell} ipython3
-X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]
-```
-
-An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is
-a dataframe the index is adjusted accordingly.
-
-```{code-cell} ipython3
-design.intercept
-```
-
-```{code-cell} ipython3
-design.transform(Carseats)[:4]
-```
-
-## Predicting
-
-Constructing the design matrix at any values is carried out by the `transform` method.
-
-```{code-cell} ipython3
-new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})
-new_X = design.transform(new_data)
-M.get_prediction(new_X).predicted_mean
-```
-
-```{code-cell} ipython3
-%%R -i new_data,Carseats
-predict(lm(Sales ~ Price + Income, data=Carseats), new_data)
-```
-
-### Difference between using `pd.DataFrame` and `np.ndarray`
-
-If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.
-
-If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,
-in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning.
-
-```{code-cell} ipython3
-Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])
-design_np = ModelSpec([0,3]).fit(Carseats_np)
-design_np.transform(Carseats_np)[:4]
-```
-
-The following will fail for hopefully obvious reasons
-
-```{code-cell} ipython3
-try:
- new_D = np.zeros((2,2))
- new_D[:,0] = [10,20]
- new_D[:,1] = [40,50]
- M.get_prediction(new_D).predicted_mean
-except ValueError as e:
- print(e)
-```
-
-Ultimately, `M` expects 3 columns for new predictions because it was fit
-with a matrix having 3 columns (the first representing an intercept).
-
-We might be tempted to try as with the `pd.DataFrame` and produce
-an `np.ndarray` with only the necessary variables.
-
-```{code-cell} ipython3
-try:
- new_X = np.zeros((2,2))
- new_X[:,0] = [10,20]
- new_X[:,1] = [40,50]
- new_D = design_np.transform(new_X)
- M.get_prediction(new_D).predicted_mean
-except IndexError as e:
- print(e)
-```
-
-This fails because `design_np` is looking for column `3` from its `terms`:
-
-```{code-cell} ipython3
-design_np.terms_
-```
-
-However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed
-represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:
-
-```{code-cell} ipython3
-new_X = np.zeros((2,4))
-new_X[:,0] = [10,20]
-new_X[:,3] = [40,50]
-new_D = design_np.transform(new_X)
-M.get_prediction(new_D).predicted_mean
-```
-
-Given this subtlety about needing to supply arrays with identical column structure to `transform` when
-using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case.
-
-+++
-
-## A model with some categorical variables
-
-Categorical variables become `Column` instances with encoders.
-
-```{code-cell} ipython3
-design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)
-design.column_info_['UIncome']
-```
-
-```{code-cell} ipython3
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef
-```
-
-## Getting the encoding you want
-
-By default the level dropped by `ModelSpec` will be the first of the `categories_` values from
-`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems
-as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`
-where `new_encoder` would somehow drop the column we want dropped.
-
-However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:
-
-```{code-cell} ipython3
-design.column_info_['UIncome']
-```
-
-This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when
-we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest
-and we need a way to allow different encodings of the same column of `Carseats`
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)
-```
-
- We can create a new
-`Column` with the encoder we want. For categorical variables, there is a convenience function to do so.
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import contrast
-pref_encoding = contrast('UIncome', 'drop', 'L')
-```
-
-```{code-cell} ipython3
-design.build_columns(Carseats, pref_encoding)
-```
-
-```{code-cell} ipython3
-design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)
-X = design.fit_transform(Carseats)
-X.columns
-```
-
-```{code-cell} ipython3
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef
-```
-
-## Interactions
-
-We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument
-to `ModelSpec`.
-
-```{code-cell} ipython3
-design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with
-`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`.
-
-```{code-cell} ipython3
-design.terms_[0]
-```
-
-Comparing this to the previous `R` model.
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)
-```
-
-We note a few important things:
-
-1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these
-columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**
-
-2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**
-
-A few notes:
-
-- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**
-
-- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily.
-
-```{code-cell} ipython3
-full_encoding = contrast('UIncome', None)
-design.build_columns(Carseats, full_encoding)
-```
-
-```{code-cell} ipython3
-design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-## Special encodings
-
-For flexible models, we may want to consider transformations of features, i.e. polynomial
-or spline transformations. Given transforms that follow the `fit/transform` paradigm
-we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`
-package includes a `Poly` transform
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import poly
-poly('Income', 3)
-```
-
-```{code-cell} ipython3
-design = ModelSpec([poly('Income', 3), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-Compare:
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef
-```
-
-## Splines
-
-Support for natural and B-splines is also included
-
-```{code-cell} ipython3
-from ISLP.models.model_spec import ns, bs, pca
-design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-library(splines)
-lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef
-```
-
-```{code-cell} ipython3
-design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef
-```
-
-## PCA
-
-```{code-cell} ipython3
-design = ModelSpec([pca(['Income',
- 'Price',
- 'Advertising',
- 'Population'],
- n_components=2,
- name='myvars'), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)
-```
-
-It is of course common to scale before running PCA.
-
-```{code-cell} ipython3
-design = ModelSpec([pca(['Income',
- 'Price',
- 'Advertising',
- 'Population'],
- n_components=2,
- name='myvars',
- scale=True), 'ShelveLoc'])
-X = design.fit_transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-%%R
-lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)
-```
-
-There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead
-of `np.std(ddof=1)`.
-
-```{code-cell} ipython3
-np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))
-```
-
-## Submodels
-
-We can build submodels as well, even if the terms do not appear in the original `terms` argument.
-Fundamentally, the terms just need to be able to have the `design.build_columns` work for us to be
-able to build a design matrix. The initial inspection of the columns of `Carseats` has created
-a column for `US`, hence we can build this submodel.
-
-```{code-cell} ipython3
-design = ModelSpec(['UIncome', 'ShelveLoc', 'Price']).fit(Carseats)
-design.build_submodel(Carseats, ['US'])
-```
-
-## ANOVA
-
-For a given `terms` argument, there as a natural sequence of models, namely those specified by `[terms[:i] for i in range(len(terms)+1]`.
-
-```{code-cell} ipython3
-design = ModelSpec(['ShelveLoc', 'Price', 'UIncome', 'US']).fit(Carseats)
-for D in design.build_sequence(Carseats):
- print(D.columns)
-```
-
-```{code-cell} ipython3
-sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))
-```
-
-```{code-cell} ipython3
-%%R
-anova(lm(Sales ~ ShelveLoc + Price + UIncome + US, data=Carseats))
-```
-
-Recall that `ModelSpec` does not inspect `terms` to reorder based on degree of
-interaction as `R` does:
-
-```{code-cell} ipython3
-design = ModelSpec([(full_encoding, 'ShelveLoc'), pref_encoding]).fit(Carseats)
-sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))
-```
-
-```{code-cell} ipython3
-%%R
-anova(lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats))
-```
-
-To agree with `R` we must order `terms` as `R` will.
-
-```{code-cell} ipython3
-design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]).fit(Carseats)
-sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))
-```
-
-## More complicated interactions
-
-Can we have an interaction of a polynomial effect with a categorical? Absolutely
-
-```{code-cell} ipython3
-%%R
-anova(lm(Sales ~ UIncome + poly(Income, 3):UIncome + UIncome:US, data=Carseats))
-```
-
-To match `R` we note that it has used its inspection rules to encode `UIncome` with 3 levels
-for the two interactions.
-
-```{code-cell} ipython3
-p3 = poly('Income', 3)
-design = ModelSpec([pref_encoding, (p3, full_encoding), (full_encoding, 'US')]).fit(Carseats)
-X = design.transform(Carseats)
-sm.OLS(Y, X).fit().params
-```
-
-```{code-cell} ipython3
-sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))
-```
-
-## Grouping columns for ANOVA
-
-The `Variable` construct can be used to group
-variables together to get custom sequences of models for `anova_lm`.
-
-```{code-cell} ipython3
-group1 = Variable(('Price', pref_encoding), 'group1', None)
-group2 = Variable(('US', 'Advertising'), 'group2', None)
-design = ModelSpec([group1, group2]).fit(Carseats)
-for D in design.build_sequence(Carseats):
- print(D.columns)
-```
-
-```{code-cell} ipython3
-sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))
-```
-
-It is not clear this is simple to do in `R` as the formula object expands all parentheses.
-
-```{code-cell} ipython3
-%%R
-anova(lm(Sales ~ (Price + UIncome) + (US + Advertising), data=Carseats))
-```
-
-It can be done by building up the models
-by hand and likely is possible to be done programmatically but it seems not obvious.
-
-```{code-cell} ipython3
-%%R
-M1 = lm(Sales ~ 1, data=Carseats)
-M2 = lm(Sales ~ Price + UIncome, data=Carseats)
-M3 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)
-anova(M1, M2, M3)
-```
-
-## Alternative anova
-
-Another common ANOVA table involves dropping each term in succession from the model and comparing
-to the full model.
-
-```{code-cell} ipython3
-Dfull = design.transform(Carseats)
-Mfull = sm.OLS(Y, Dfull).fit()
-for i, D in enumerate(design.build_sequence(Carseats, anova_type='drop')):
- if i == 0:
- D0 = D
- print(set(D.columns) ^ set(Dfull.columns))
- print(sm.stats.anova_lm(sm.OLS(Y, D).fit(), Mfull))
-```
-
-```{code-cell} ipython3
-%%R
-M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)
-M2 = lm(Sales ~ US + Advertising, data=Carseats)
-print(anova(M2, M1))
-M3 = lm(Sales ~ Price + UIncome, data=Carseats)
-print(anova(M3, M1))
-```
-
-The comparison without the intercept here is actually very hard to achieve in `R` with `anova` due to its inspection
-of the formula.
-
-```{code-cell} ipython3
-%%R
-M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)
-M4 = lm(Sales ~ Price + UIncome + US + Advertising - 1, data=Carseats)
-print(anova(M4, M1))
-```
-
-It can be found with `summary`.
-
-```{code-cell} ipython3
-%%R
-summary(M1)
-```
-
-```{code-cell} ipython3
-378.690726, 19.46**2
-```
-
-```{code-cell} ipython3
-
-```
diff --git a/docs/jupyterbook/transforms/PCA.ipynb b/docs/jupyterbook/transforms/PCA.ipynb
index d8b41f3..ec1e0ae 100644
--- a/docs/jupyterbook/transforms/PCA.ipynb
+++ b/docs/jupyterbook/transforms/PCA.ipynb
@@ -19,9 +19,14 @@
"outputs": [],
"source": [
"import numpy as np\n",
+ "from sklearn.decomposition import PCA\n",
+ "\n",
"from ISLP import load_data\n",
- "from ISLP.models import ModelSpec, pca, Variable, derived_variable\n",
- "from sklearn.decomposition import PCA"
+ "from ISLP.models import (ModelSpec, \n",
+ " pca, \n",
+ " Feature, \n",
+ " derived_feature,\n",
+ " build_columns)"
]
},
{
@@ -71,7 +76,7 @@
"id": "fff603bf",
"metadata": {},
"source": [
- "Suppose we want to make a `Variable` representing the first 3 principal components of the\n",
+ "Suppose we want to make a `Feature` representing the first 3 principal components of the\n",
" features `['CompPrice', 'Income', 'Advertising', 'Population', 'Price']`."
]
},
@@ -80,8 +85,8 @@
"id": "eab49ad1-3957-478f-8a76-28a8f58551e9",
"metadata": {},
"source": [
- "We first make a `Variable` that represents these five features columns, then `pca`\n",
- "can be used to compute a new `Variable` that returns the first three principal components."
+ "We first make a `Feature` that represents these five features columns, then `pca`\n",
+ "can be used to compute a new `Feature` that returns the first three principal components."
]
},
{
@@ -91,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
- "grouped = Variable(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)\n",
+ "grouped = Feature(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)\n",
"sklearn_pca = PCA(n_components=3, whiten=True)"
]
},
@@ -100,7 +105,7 @@
"id": "b45655a3-393d-4b4c-b754-cda61ed0e014",
"metadata": {},
"source": [
- "We can now fit `sklearn_pca` and create our new variable."
+ "We can now fit `sklearn_pca` and create our new feature."
]
},
{
@@ -108,175 +113,18 @@
"execution_count": 5,
"id": "6cfe8861-ad07-47b9-95d1-5d5513ff6fbe",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "sklearn_pca.fit(design.build_columns(Carseats, grouped)[0]) \n",
- "pca_var = derived_variable(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],\n",
+ "grouped_features = build_columns(design.column_info_,\n",
+ " Carseats,\n",
+ " grouped)[0]\n",
+ "sklearn_pca.fit(grouped_features) \n",
+ "pca_var = derived_feature(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],\n",
" name='pca(grouped)', encoder=sklearn_pca)\n",
- "derived_features, _ = design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "aeb47184-9e15-4a6e-b60a-916f5ff89063",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " CompPrice | \n",
- " Income | \n",
- " Advertising | \n",
- " Population | \n",
- " Price | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 138 | \n",
- " 73 | \n",
- " 11 | \n",
- " 276 | \n",
- " 120 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 111 | \n",
- " 48 | \n",
- " 16 | \n",
- " 260 | \n",
- " 83 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 113 | \n",
- " 35 | \n",
- " 10 | \n",
- " 269 | \n",
- " 80 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 117 | \n",
- " 100 | \n",
- " 4 | \n",
- " 466 | \n",
- " 97 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 141 | \n",
- " 64 | \n",
- " 3 | \n",
- " 340 | \n",
- " 128 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 395 | \n",
- " 138 | \n",
- " 108 | \n",
- " 17 | \n",
- " 203 | \n",
- " 128 | \n",
- "
\n",
- " \n",
- " | 396 | \n",
- " 139 | \n",
- " 23 | \n",
- " 3 | \n",
- " 37 | \n",
- " 120 | \n",
- "
\n",
- " \n",
- " | 397 | \n",
- " 162 | \n",
- " 26 | \n",
- " 12 | \n",
- " 368 | \n",
- " 159 | \n",
- "
\n",
- " \n",
- " | 398 | \n",
- " 100 | \n",
- " 79 | \n",
- " 7 | \n",
- " 284 | \n",
- " 95 | \n",
- "
\n",
- " \n",
- " | 399 | \n",
- " 134 | \n",
- " 37 | \n",
- " 0 | \n",
- " 27 | \n",
- " 120 | \n",
- "
\n",
- " \n",
- "
\n",
- "
400 rows × 5 columns
\n",
- "
"
- ],
- "text/plain": [
- " CompPrice Income Advertising Population Price\n",
- "0 138 73 11 276 120\n",
- "1 111 48 16 260 83\n",
- "2 113 35 10 269 80\n",
- "3 117 100 4 466 97\n",
- "4 141 64 3 340 128\n",
- ".. ... ... ... ... ...\n",
- "395 138 108 17 203 128\n",
- "396 139 23 3 37 120\n",
- "397 162 26 12 368 159\n",
- "398 100 79 7 284 95\n",
- "399 134 37 0 27 120\n",
- "\n",
- "[400 rows x 5 columns]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, grouped)[0]"
+ "derived_features, _ = build_columns(design.column_info_,\n",
+ " Carseats, \n",
+ " pca_var,\n",
+ " encoders=design.encoders_)"
]
},
{
@@ -291,7 +139,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"id": "9f4b0955",
"metadata": {},
"outputs": [],
@@ -304,22 +152,10 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"id": "6b382699-eb86-457f-8e91-09a63eb21d49",
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
{
"data": {
"text/plain": [
@@ -329,7 +165,7 @@
" dtype='object')"
]
},
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -350,7 +186,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"id": "4a8d9b28",
"metadata": {},
"outputs": [],
@@ -361,7 +197,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 13,
"id": "6efa6c67-86e1-4f51-86c2-25c838a90bf4",
"metadata": {},
"outputs": [
@@ -371,7 +207,7 @@
"(4.073428490498941e-14, 0.0)"
]
},
- "execution_count": 10,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -386,9 +222,9 @@
"formats": "source/transforms///ipynb,jupyterbook/transforms///md:myst,jupyterbook/transforms///ipynb"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
diff --git a/docs/jupyterbook/transforms/PCA.md b/docs/jupyterbook/transforms/PCA.md
index b9ba769..6b1a77f 100644
--- a/docs/jupyterbook/transforms/PCA.md
+++ b/docs/jupyterbook/transforms/PCA.md
@@ -5,11 +5,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Derived features: using PCA on a subset of columns
@@ -19,9 +19,14 @@ construction of transformers applied to features.
```{code-cell} ipython3
import numpy as np
-from ISLP import load_data
-from ISLP.models import ModelSpec, pca, Variable, derived_variable
from sklearn.decomposition import PCA
+
+from ISLP import load_data
+from ISLP.models import (ModelSpec,
+ pca,
+ Feature,
+ derived_feature,
+ build_columns)
```
```{code-cell} ipython3
@@ -35,30 +40,32 @@ Let's create a `ModelSpec` that is aware of all of the relevant columns.
design = ModelSpec(Carseats.columns.drop(['Sales'])).fit(Carseats)
```
-Suppose we want to make a `Variable` representing the first 3 principal components of the
+Suppose we want to make a `Feature` representing the first 3 principal components of the
features `['CompPrice', 'Income', 'Advertising', 'Population', 'Price']`.
+++
-We first make a `Variable` that represents these five features columns, then `pca`
-can be used to compute a new `Variable` that returns the first three principal components.
+We first make a `Feature` that represents these five features columns, then `pca`
+can be used to compute a new `Feature` that returns the first three principal components.
```{code-cell} ipython3
-grouped = Variable(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)
+grouped = Feature(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)
sklearn_pca = PCA(n_components=3, whiten=True)
```
-We can now fit `sklearn_pca` and create our new variable.
+We can now fit `sklearn_pca` and create our new feature.
```{code-cell} ipython3
-sklearn_pca.fit(design.build_columns(Carseats, grouped)[0])
-pca_var = derived_variable(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],
+grouped_features = build_columns(design.column_info_,
+ Carseats,
+ grouped)[0]
+sklearn_pca.fit(grouped_features)
+pca_var = derived_feature(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],
name='pca(grouped)', encoder=sklearn_pca)
-derived_features, _ = design.build_columns(Carseats, pca_var)
-```
-
-```{code-cell} ipython3
-design.build_columns(Carseats, grouped)[0]
+derived_features, _ = build_columns(design.column_info_,
+ Carseats,
+ pca_var,
+ encoders=design.encoders_)
```
## Helper function
diff --git a/docs/jupyterbook/transforms/poly.ipynb b/docs/jupyterbook/transforms/poly.ipynb
index 54d7b4e..45c862e 100644
--- a/docs/jupyterbook/transforms/poly.ipynb
+++ b/docs/jupyterbook/transforms/poly.ipynb
@@ -168,7 +168,7 @@
"source": [
"## Underlying model\n",
"\n",
- "If we look at `quartic`, we see it is a `Variable`, i.e. it can be used to produce a set of columns\n",
+ "If we look at `quartic`, we see it is a `Feature`, i.e. it can be used to produce a set of columns\n",
"in a design matrix when it is a term used in creating the `ModelSpec`.\n",
"\n",
"Its encoder is `Poly(degree=4)`. This is a special `sklearn` transform that expects a single column\n",
@@ -319,9 +319,9 @@
"formats": "source/transforms///ipynb,jupyterbook/transforms///md:myst,jupyterbook/transforms///ipynb"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
diff --git a/docs/jupyterbook/transforms/poly.md b/docs/jupyterbook/transforms/poly.md
index 45e0e3d..e5aef11 100644
--- a/docs/jupyterbook/transforms/poly.md
+++ b/docs/jupyterbook/transforms/poly.md
@@ -5,11 +5,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Polynomial features
@@ -66,7 +66,7 @@ np.linalg.norm(ISLP_features - R_features)
## Underlying model
-If we look at `quartic`, we see it is a `Variable`, i.e. it can be used to produce a set of columns
+If we look at `quartic`, we see it is a `Feature`, i.e. it can be used to produce a set of columns
in a design matrix when it is a term used in creating the `ModelSpec`.
Its encoder is `Poly(degree=4)`. This is a special `sklearn` transform that expects a single column
diff --git a/docs/jupyterbook/transforms/splines.ipynb b/docs/jupyterbook/transforms/splines.ipynb
index f28d786..399b0be 100644
--- a/docs/jupyterbook/transforms/splines.ipynb
+++ b/docs/jupyterbook/transforms/splines.ipynb
@@ -310,9 +310,9 @@
"formats": "source/transforms///ipynb,jupyterbook/transforms///md:myst,jupyterbook/transforms///ipynb"
},
"kernelspec": {
- "display_name": "islp_test",
+ "display_name": "python3",
"language": "python",
- "name": "islp_test"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
diff --git a/docs/jupyterbook/transforms/splines.md b/docs/jupyterbook/transforms/splines.md
index f14bc17..de0ee3d 100644
--- a/docs/jupyterbook/transforms/splines.md
+++ b/docs/jupyterbook/transforms/splines.md
@@ -5,11 +5,11 @@ jupytext:
extension: .md
format_name: myst
format_version: 0.13
- jupytext_version: 1.14.1
+ jupytext_version: 1.14.5
kernelspec:
- display_name: islp_test
+ display_name: python3
language: python
- name: islp_test
+ name: python3
---
# Spline features
diff --git a/docs/make_notebooks.py b/docs/make_notebooks.py
new file mode 100644
index 0000000..cfea244
--- /dev/null
+++ b/docs/make_notebooks.py
@@ -0,0 +1,107 @@
+'''
+Run notebooks in an isolated environment specified by a requirements.txt file
+'''
+
+from hashlib import md5
+import tempfile
+import os
+from argparse import ArgumentParser
+
+
+parser = ArgumentParser()
+parser.add_argument('--requirements',
+ default='requirements.txt')
+parser.add_argument('labs',
+ metavar='N',
+ type=str,
+ nargs='+')
+parser.add_argument('--python',
+ default='3.10')
+parser.add_argument('--tarball',
+ default=None,
+ dest='tarball')
+parser.add_argument('--inplace',
+ default=False,
+ action='store_true',
+ help='run notebooks in place?')
+parser.add_argument('--timeout',
+ default=5000,
+ help='preprocessor timeout')
+parser.add_argument('--env_tag',
+ default='')
+
+def make_notebooks(requirements='requirements.txt',
+ srcs=[],
+ dests=[],
+ tarball='',
+ inplace=False,
+ tmpdir='',
+ python='3.10',
+ timeout=5000, # should be enough for Ch10
+ env_tag='',
+ ):
+
+ if tarball and inplace:
+ raise ValueError('tarball option expects notebooks in a tmpdir, while inplace does not copy to a tmpdir')
+
+ md5_ = md5()
+ md5_.update(open(requirements, 'rb').read());
+ hash_ = md5_.hexdigest()[:8]
+
+ env_name = f'isolated_env_{hash_}' + env_tag
+
+ setup_cmd = f'''
+ conda create -n {env_name} python={python} -y;
+ conda run -n {env_name} pip install -r {requirements} jupyter jupytext;
+ '''
+
+ print(setup_cmd)
+ os.system(setup_cmd)
+
+ # may need to up "ulimit -n 4096"
+ archive_files = []
+ for src_, dest_ in zip(srcs, dests):
+ if src_ != dest_:
+ os.system(f'cp {src_} {dest_}')
+ name = os.path.split(dest_)[1]
+ build_cmd = f'''conda run -n {env_name} jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout={timeout} {dest_} '''
+ if '02' in name:
+ build_cmd += ' --allow-errors '
+
+ print(build_cmd)
+ os.system(build_cmd)
+ archive_files.append(name)
+
+ archive_files = ' '.join(archive_files)
+
+ if tarball:
+ tarball = os.path.abspath(tarball)
+ tarball_cmd = f'''
+ cd {tmpdir}; tar -cvzf {tarball} {archive_files}
+ '''
+ print(tarball_cmd)
+ os.system(tarball_cmd)
+
+ os.system(f'conda env remove -n {env_name}')
+
+if __name__ == '__main__':
+
+ args = parser.parse_args()
+ srcs = [os.path.abspath(l) for l in args.labs]
+
+ tmpdir = tempfile.mkdtemp()
+
+ if args.inplace:
+ dests = srcs
+ else:
+ dests = [os.path.join(tmpdir, os.path.split(l)[1]) for l in args.labs]
+
+ make_notebooks(requirements=os.path.abspath(args.requirements),
+ srcs=srcs,
+ dests=dests,
+ inplace=args.inplace,
+ tmpdir=tmpdir,
+ python=args.python,
+ tarball=args.tarball,
+ timeout=args.timeout,
+ env_tag=args.env_tag)
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 68ef4bc..10bce0e 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,3 +1,7 @@
texext
numpydoc
myst_nb
+sphinx-book-theme
+rpy2
+sphinx_rtd_theme
+jupytext
diff --git a/docs/source/.ipynb_checkpoints/imdb-checkpoint.ipynb b/docs/source/.ipynb_checkpoints/imdb-checkpoint.ipynb
deleted file mode 100644
index c78ca44..0000000
--- a/docs/source/.ipynb_checkpoints/imdb-checkpoint.ipynb
+++ /dev/null
@@ -1,271 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "3eff5ba8",
- "metadata": {},
- "source": [
- "# Creating a clean IMDB dataset\n",
- "\n",
- "Running this example requires `keras`. Use `pip install keras` to install if necessary."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "53925437",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pickle"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "a855c7c0",
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "from scipy.sparse import coo_matrix, save_npz\n",
- "import torch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "fe16fa84",
- "metadata": {},
- "outputs": [],
- "source": [
- "from keras.datasets import imdb\n",
- "from tensorflow.keras.preprocessing.sequence import pad_sequences\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "0369a36a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# the 3 is for three terms: \n",
- "num_words = 10000+3\n",
- "((S_train, Y_train), \n",
- " (S_test, Y_test)) = imdb.load_data(num_words=num_words)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "9e84d7e3",
- "metadata": {},
- "outputs": [],
- "source": [
- "Y_train = Y_train.astype(np.float32)\n",
- "Y_test = Y_test.astype(np.float32)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "1a737737",
- "metadata": {},
- "outputs": [],
- "source": [
- "def one_hot(sequences, ncol):\n",
- " idx, vals = [], []\n",
- " for i, s in enumerate(sequences):\n",
- " idx.extend({(i,v):1 for v in s}.keys())\n",
- " idx = np.array(idx).T\n",
- " vals = np.ones(idx.shape[1], dtype=np.float32)\n",
- " tens = torch.sparse_coo_tensor(indices=idx,\n",
- " values=vals,\n",
- " size=(len(sequences), ncol))\n",
- " return tens.coalesce()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "f08ad327",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train, L_train = one_hot(S_train, num_words), Y_train\n",
- "X_test = one_hot(S_test, num_words)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "98481bbb",
- "metadata": {},
- "outputs": [],
- "source": [
- "def convert_sparse_tensor(X):\n",
- " idx = np.asarray(X.indices())\n",
- " vals = np.asarray(X.values())\n",
- " return coo_matrix((vals,\n",
- " (idx[0],\n",
- " idx[1])),\n",
- " shape=X.shape).tocsr()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "5a17bd62",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train_s = convert_sparse_tensor(X_train)\n",
- "X_test_s = convert_sparse_tensor(X_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "ca57aea4",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train_d = torch.tensor(X_train_s.todense())\n",
- "X_test_d = torch.tensor(X_test_s.todense())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "3d017780",
- "metadata": {},
- "outputs": [],
- "source": [
- "torch.save(X_train_d, 'IMDB_X_train.tensor')\n",
- "torch.save(X_test_d, 'IMDB_X_test.tensor')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f9bb0163",
- "metadata": {},
- "source": [
- "save the sparse matrices"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "23afd3e5",
- "metadata": {},
- "outputs": [],
- "source": [
- "save_npz('IMDB_X_test.npz', X_test_s)\n",
- "save_npz('IMDB_X_train.npz', X_train_s)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "d33568d1",
- "metadata": {},
- "outputs": [],
- "source": [
- "np.save('IMDB_Y_test.npy', Y_test)\n",
- "np.save('IMDB_Y_train.npy', L_train)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f9110984",
- "metadata": {},
- "source": [
- "save and pickle the word index"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "ff44a0b4",
- "metadata": {},
- "outputs": [],
- "source": [
- "word_index = imdb.get_word_index()\n",
- "lookup = {(i+3):w for w, i in word_index.items()}\n",
- "lookup[0] = \"\"\n",
- "lookup[1] = \"\"\n",
- "lookup[2] = \"\"\n",
- "lookup[4] = \"\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "1486c640",
- "metadata": {},
- "outputs": [],
- "source": [
- "pickle.dump(lookup, open('IMDB_word_index.pkl', 'bw'))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "57e606c5",
- "metadata": {},
- "source": [
- "create the padded representations"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "3ab7a4ac",
- "metadata": {},
- "outputs": [],
- "source": [
- "(S_train,\n",
- " S_test) = [torch.tensor(pad_sequences(S, maxlen=500, value=0))\n",
- " for S in [S_train,\n",
- " S_test]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "55cb2d49",
- "metadata": {},
- "outputs": [],
- "source": [
- "torch.save(S_train, 'IMDB_S_train.tensor')\n",
- "torch.save(S_test, 'IMDB_S_test.tensor')"
- ]
- }
- ],
- "metadata": {
- "jupytext": {
- "cell_metadata_filter": "-all",
- "formats": "py:percent,ipynb,md:myst",
- "main_language": "python"
- },
- "kernelspec": {
- "display_name": "islp_test",
- "language": "python",
- "name": "islp_test"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/source/api/gen.rst b/docs/source/api/gen.rst
index 2539220..fb3bec5 100644
--- a/docs/source/api/gen.rst
+++ b/docs/source/api/gen.rst
@@ -6,7 +6,6 @@
generated/ISLP.bart.bart
generated/ISLP.bart.likelihood
generated/ISLP.bart.particle_tree
- generated/ISLP.bart.tmpbart
generated/ISLP.bart.tree
generated/ISLP.cluster
generated/ISLP.models
diff --git a/docs/source/api/generated/ISLP.bart.tmpbart.rst b/docs/source/api/generated/ISLP.bart.tmpbart.rst
deleted file mode 100644
index b72117a..0000000
--- a/docs/source/api/generated/ISLP.bart.tmpbart.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-.. AUTO-GENERATED FILE -- DO NOT EDIT!
-
-bart.tmpbart
-============
-
-Module: :mod:`bart.tmpbart`
----------------------------
-Inheritance diagram for ``ISLP.bart.tmpbart``:
-
-.. inheritance-diagram:: ISLP.bart.tmpbart
- :parts: 3
-
-.. automodule:: ISLP.bart.tmpbart
-
-.. currentmodule:: ISLP.bart.tmpbart
-
-Classes
--------
-
-:class:`BART`
-~~~~~~~~~~~~~
-
-
-.. autoclass:: BART
- :members:
- :undoc-members:
- :show-inheritance:
- :inherited-members:
-
- .. automethod:: __init__
-
-:class:`SampleSplittingVariable`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
-.. autoclass:: SampleSplittingVariable
- :members:
- :undoc-members:
- :show-inheritance:
- :inherited-members:
-
- .. automethod:: __init__
diff --git a/docs/source/api/generated/ISLP.models.model_spec.rst b/docs/source/api/generated/ISLP.models.model_spec.rst
index c379253..d457e3a 100644
--- a/docs/source/api/generated/ISLP.models.model_spec.rst
+++ b/docs/source/api/generated/ISLP.models.model_spec.rst
@@ -29,11 +29,11 @@ Classes
.. automethod:: __init__
-:class:`ModelSpec`
-~~~~~~~~~~~~~~~~~~
+:class:`Feature`
+~~~~~~~~~~~~~~~~
-.. autoclass:: ModelSpec
+.. autoclass:: Feature
:members:
:undoc-members:
:show-inheritance:
@@ -41,11 +41,11 @@ Classes
.. automethod:: __init__
-:class:`Variable`
-~~~~~~~~~~~~~~~~~
+:class:`ModelSpec`
+~~~~~~~~~~~~~~~~~~
-.. autoclass:: Variable
+.. autoclass:: ModelSpec
:members:
:undoc-members:
:show-inheritance:
@@ -63,10 +63,13 @@ Functions
.. autofunction:: ISLP.models.model_spec.build_columns
+.. autofunction:: ISLP.models.model_spec.build_model
+
+
.. autofunction:: ISLP.models.model_spec.contrast
-.. autofunction:: ISLP.models.model_spec.derived_variable
+.. autofunction:: ISLP.models.model_spec.derived_feature
.. autofunction:: ISLP.models.model_spec.fit_encoder
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
index 4734cda..8aededd 100644
--- a/docs/source/api/index.rst
+++ b/docs/source/api/index.rst
@@ -1,12 +1,7 @@
-.. _api-index:
+ISLP reference
+--------------
-#####
- API
-#####
-.. only:: html
+.. toctree::
- :Release: |version|
- :Date: |today|
-
-.. include:: gen.rst
+ gen
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 5da3dda..546d74f 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -2,12 +2,32 @@
# -- Project information
+import json
+import os
+
project = 'ISLP'
copyright = '2023, ISLP authors'
author = 'Jonathan Taylor'
-release = '0.1'
-version = '0.1.0'
+import ISLP
+version = ISLP.__version__
+
+import __main__
+dirname = os.path.split(__file__)[0]
+print(dirname, 'dirname')
+
+docs_version = json.loads(open(os.path.join(dirname, 'docs_version.json')).read())
+lab_version = docs_version['labs']
+
+myst_enable_extensions = ['substitution']
+
+myst_substitutions = {
+ "ISLP_lab_link": f"[ISLP_labs/{lab_version}](https://github.com/intro-stat-learning/ISLP_labs/tree/{lab_version})",
+ "ISLP_zip_link": f"[ISLP_labs/{lab_version}.zip](https://github.com/intro-stat-learning/ISLP_labs/archive/refs/tags/{lab_version}.zip)",
+ "ISLP_binder_code": f"[](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/{lab_version})",
+ "ISLP_lab_version": "[ISLP/{0}](https://github.com/intro-stat-learning/ISLP/tree/{0})".format(docs_version['library'])
+ }
+myst_number_code_blocks = ['python', 'ipython3']
# -- General configuration
@@ -27,7 +47,16 @@
graphviz_dot = '/opt/homebrew/bin/dot'
numpydoc_class_members_toctree = False
-nb_execution_mode = "cache"
+nb_execution_mode = "auto"
+nb_execution_timeout = 60*20 #*100
+# labs will be built with specific commits of ISLP/ISLP_labs
+# we want the Ch06 run to exclude the warnings
+nb_execution_excludepatterns = (['imdb.ipynb'] +
+ [f'Ch{i:02d}*' for i in range(2, 14)])
+print('exclude patterns', nb_execution_excludepatterns)
+nb_execution_allow_errors = True
+
+#nb_kernel_rgx_aliases = {'python3': "islp_test"}
intersphinx_mapping = {
'python': ('https://docs.python.org/3/', None),
@@ -42,7 +71,19 @@
# -- Options for HTML output
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_book_theme"
+html_theme_options = {
+ "repository_url": "https://github.com/intro-stat-learning/ISLP.git",
+ "use_repository_button": True,
+}
+html_title = "Introduction to Statistical Learning (Python)"
+html_logo = "logo.png"
+
+source_suffix = {
+ '.rst': 'restructuredtext',
+ '.ipynb': 'myst-nb',
+ '.myst': 'myst-nb',
+}
# -- Options for EPUB output
epub_show_urls = 'footnote'
diff --git a/docs/source/datasets/Auto.ipynb b/docs/source/datasets/Auto.ipynb
index b88ea02..b588844 100644
--- a/docs/source/datasets/Auto.ipynb
+++ b/docs/source/datasets/Auto.ipynb
@@ -44,7 +44,14 @@
"cell_type": "code",
"execution_count": null,
"id": "182ea1d1",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:43.883548Z",
+ "iopub.status.busy": "2023-07-26T12:47:43.883261Z",
+ "iopub.status.idle": "2023-07-26T12:47:44.433075Z",
+ "shell.execute_reply": "2023-07-26T12:47:44.432801Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -56,7 +63,14 @@
"cell_type": "code",
"execution_count": null,
"id": "979abd7e",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:44.434662Z",
+ "iopub.status.busy": "2023-07-26T12:47:44.434558Z",
+ "iopub.status.idle": "2023-07-26T12:47:44.436577Z",
+ "shell.execute_reply": "2023-07-26T12:47:44.436322Z"
+ }
+ },
"outputs": [],
"source": [
"Auto.shape"
@@ -66,7 +80,14 @@
"cell_type": "code",
"execution_count": null,
"id": "7444c0f0",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:44.438047Z",
+ "iopub.status.busy": "2023-07-26T12:47:44.437943Z",
+ "iopub.status.idle": "2023-07-26T12:47:44.439951Z",
+ "shell.execute_reply": "2023-07-26T12:47:44.439712Z"
+ }
+ },
"outputs": [],
"source": [
"Auto.columns"
@@ -76,7 +97,14 @@
"cell_type": "code",
"execution_count": null,
"id": "59b6e919",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:44.441257Z",
+ "iopub.status.busy": "2023-07-26T12:47:44.441161Z",
+ "iopub.status.idle": "2023-07-26T12:47:44.449658Z",
+ "shell.execute_reply": "2023-07-26T12:47:44.449426Z"
+ }
+ },
"outputs": [],
"source": [
"Auto.describe().iloc[:,:4]"
@@ -91,6 +119,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Bikeshare.ipynb b/docs/source/datasets/Bikeshare.ipynb
index ddb1053..ab42024 100644
--- a/docs/source/datasets/Bikeshare.ipynb
+++ b/docs/source/datasets/Bikeshare.ipynb
@@ -56,7 +56,14 @@
"cell_type": "code",
"execution_count": null,
"id": "bcdb89b6",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:19.462730Z",
+ "iopub.status.busy": "2023-07-26T12:47:19.461535Z",
+ "iopub.status.idle": "2023-07-26T12:47:20.022610Z",
+ "shell.execute_reply": "2023-07-26T12:47:20.022326Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -68,7 +75,14 @@
"cell_type": "code",
"execution_count": null,
"id": "72075fb0",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:20.024144Z",
+ "iopub.status.busy": "2023-07-26T12:47:20.024034Z",
+ "iopub.status.idle": "2023-07-26T12:47:20.026016Z",
+ "shell.execute_reply": "2023-07-26T12:47:20.025777Z"
+ }
+ },
"outputs": [],
"source": [
"Bikeshare.shape"
@@ -78,7 +92,14 @@
"cell_type": "code",
"execution_count": null,
"id": "45396d69",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:20.027480Z",
+ "iopub.status.busy": "2023-07-26T12:47:20.027378Z",
+ "iopub.status.idle": "2023-07-26T12:47:20.029427Z",
+ "shell.execute_reply": "2023-07-26T12:47:20.029199Z"
+ }
+ },
"outputs": [],
"source": [
"Bikeshare.columns"
@@ -88,7 +109,14 @@
"cell_type": "code",
"execution_count": null,
"id": "26c24d9a",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:20.030734Z",
+ "iopub.status.busy": "2023-07-26T12:47:20.030638Z",
+ "iopub.status.idle": "2023-07-26T12:47:20.042031Z",
+ "shell.execute_reply": "2023-07-26T12:47:20.041787Z"
+ }
+ },
"outputs": [],
"source": [
"Bikeshare.describe().iloc[:,:4]"
@@ -105,6 +133,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Boston.ipynb b/docs/source/datasets/Boston.ipynb
index 569f5b4..027585a 100644
--- a/docs/source/datasets/Boston.ipynb
+++ b/docs/source/datasets/Boston.ipynb
@@ -49,7 +49,14 @@
"cell_type": "code",
"execution_count": null,
"id": "b8bb96f0",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:31.625524Z",
+ "iopub.status.busy": "2023-07-26T12:47:31.625196Z",
+ "iopub.status.idle": "2023-07-26T12:47:32.177553Z",
+ "shell.execute_reply": "2023-07-26T12:47:32.177240Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -61,7 +68,14 @@
"cell_type": "code",
"execution_count": null,
"id": "ab4b03f8",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:32.179272Z",
+ "iopub.status.busy": "2023-07-26T12:47:32.179157Z",
+ "iopub.status.idle": "2023-07-26T12:47:32.181230Z",
+ "shell.execute_reply": "2023-07-26T12:47:32.180964Z"
+ }
+ },
"outputs": [],
"source": [
"Boston.shape"
@@ -71,7 +85,14 @@
"cell_type": "code",
"execution_count": null,
"id": "74890e1f",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:32.182653Z",
+ "iopub.status.busy": "2023-07-26T12:47:32.182557Z",
+ "iopub.status.idle": "2023-07-26T12:47:32.184501Z",
+ "shell.execute_reply": "2023-07-26T12:47:32.184276Z"
+ }
+ },
"outputs": [],
"source": [
"Boston.columns"
@@ -81,7 +102,14 @@
"cell_type": "code",
"execution_count": null,
"id": "90ecf46f",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:32.185826Z",
+ "iopub.status.busy": "2023-07-26T12:47:32.185735Z",
+ "iopub.status.idle": "2023-07-26T12:47:32.198310Z",
+ "shell.execute_reply": "2023-07-26T12:47:32.198074Z"
+ }
+ },
"outputs": [],
"source": [
"Boston.describe()"
@@ -98,6 +126,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/BrainCancer.ipynb b/docs/source/datasets/BrainCancer.ipynb
index cb75946..89e8b2c 100644
--- a/docs/source/datasets/BrainCancer.ipynb
+++ b/docs/source/datasets/BrainCancer.ipynb
@@ -39,7 +39,14 @@
"cell_type": "code",
"execution_count": null,
"id": "519fa8cf",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:09.619445Z",
+ "iopub.status.busy": "2023-07-26T12:47:09.618768Z",
+ "iopub.status.idle": "2023-07-26T12:47:10.149955Z",
+ "shell.execute_reply": "2023-07-26T12:47:10.149508Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -51,7 +58,14 @@
"cell_type": "code",
"execution_count": null,
"id": "ac7f1920",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:10.151658Z",
+ "iopub.status.busy": "2023-07-26T12:47:10.151541Z",
+ "iopub.status.idle": "2023-07-26T12:47:10.153944Z",
+ "shell.execute_reply": "2023-07-26T12:47:10.153658Z"
+ }
+ },
"outputs": [],
"source": [
"BrainCancer.shape"
@@ -61,7 +75,14 @@
"cell_type": "code",
"execution_count": null,
"id": "64b3177f",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:10.155433Z",
+ "iopub.status.busy": "2023-07-26T12:47:10.155323Z",
+ "iopub.status.idle": "2023-07-26T12:47:10.157819Z",
+ "shell.execute_reply": "2023-07-26T12:47:10.157458Z"
+ }
+ },
"outputs": [],
"source": [
"BrainCancer.columns"
@@ -71,7 +92,14 @@
"cell_type": "code",
"execution_count": null,
"id": "8132496d",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:10.159542Z",
+ "iopub.status.busy": "2023-07-26T12:47:10.159420Z",
+ "iopub.status.idle": "2023-07-26T12:47:10.166890Z",
+ "shell.execute_reply": "2023-07-26T12:47:10.166610Z"
+ }
+ },
"outputs": [],
"source": [
"BrainCancer.describe()"
@@ -81,7 +109,14 @@
"cell_type": "code",
"execution_count": null,
"id": "ed04719d",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:10.168420Z",
+ "iopub.status.busy": "2023-07-26T12:47:10.168324Z",
+ "iopub.status.idle": "2023-07-26T12:47:10.171157Z",
+ "shell.execute_reply": "2023-07-26T12:47:10.170862Z"
+ }
+ },
"outputs": [],
"source": [
"BrainCancer['diagnosis'].value_counts()"
@@ -98,6 +133,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Caravan.ipynb b/docs/source/datasets/Caravan.ipynb
index f093422..ab39457 100644
--- a/docs/source/datasets/Caravan.ipynb
+++ b/docs/source/datasets/Caravan.ipynb
@@ -27,7 +27,14 @@
"cell_type": "code",
"execution_count": null,
"id": "1f9a6aaa",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:12.041705Z",
+ "iopub.status.busy": "2023-07-26T12:47:12.040979Z",
+ "iopub.status.idle": "2023-07-26T12:47:12.637566Z",
+ "shell.execute_reply": "2023-07-26T12:47:12.637297Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -39,7 +46,14 @@
"cell_type": "code",
"execution_count": null,
"id": "88755969",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:12.639146Z",
+ "iopub.status.busy": "2023-07-26T12:47:12.639031Z",
+ "iopub.status.idle": "2023-07-26T12:47:12.640881Z",
+ "shell.execute_reply": "2023-07-26T12:47:12.640666Z"
+ }
+ },
"outputs": [],
"source": [
"Caravan.shape"
@@ -49,7 +63,14 @@
"cell_type": "code",
"execution_count": null,
"id": "52ea2641",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:12.642281Z",
+ "iopub.status.busy": "2023-07-26T12:47:12.642186Z",
+ "iopub.status.idle": "2023-07-26T12:47:12.644243Z",
+ "shell.execute_reply": "2023-07-26T12:47:12.644020Z"
+ }
+ },
"outputs": [],
"source": [
"Caravan.columns[:20]"
@@ -66,6 +87,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Carseats.ipynb b/docs/source/datasets/Carseats.ipynb
index dfd36d4..92ff1b4 100644
--- a/docs/source/datasets/Carseats.ipynb
+++ b/docs/source/datasets/Carseats.ipynb
@@ -37,7 +37,14 @@
"cell_type": "code",
"execution_count": null,
"id": "984643c9",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:26.781289Z",
+ "iopub.status.busy": "2023-07-26T12:47:26.780964Z",
+ "iopub.status.idle": "2023-07-26T12:47:27.314225Z",
+ "shell.execute_reply": "2023-07-26T12:47:27.313885Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -49,7 +56,14 @@
"cell_type": "code",
"execution_count": null,
"id": "663f5f6a",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:27.316055Z",
+ "iopub.status.busy": "2023-07-26T12:47:27.315854Z",
+ "iopub.status.idle": "2023-07-26T12:47:27.318176Z",
+ "shell.execute_reply": "2023-07-26T12:47:27.317912Z"
+ }
+ },
"outputs": [],
"source": [
"Carseats.shape"
@@ -59,7 +73,14 @@
"cell_type": "code",
"execution_count": null,
"id": "386299b2",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:27.319606Z",
+ "iopub.status.busy": "2023-07-26T12:47:27.319504Z",
+ "iopub.status.idle": "2023-07-26T12:47:27.321648Z",
+ "shell.execute_reply": "2023-07-26T12:47:27.321403Z"
+ }
+ },
"outputs": [],
"source": [
"Carseats.columns"
@@ -69,7 +90,14 @@
"cell_type": "code",
"execution_count": null,
"id": "5c8c69c8",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:27.323205Z",
+ "iopub.status.busy": "2023-07-26T12:47:27.323091Z",
+ "iopub.status.idle": "2023-07-26T12:47:27.331921Z",
+ "shell.execute_reply": "2023-07-26T12:47:27.331627Z"
+ }
+ },
"outputs": [],
"source": [
"Carseats.describe().iloc[:,:4]"
@@ -86,6 +114,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/College.ipynb b/docs/source/datasets/College.ipynb
index af1027d..27a4d1d 100644
--- a/docs/source/datasets/College.ipynb
+++ b/docs/source/datasets/College.ipynb
@@ -58,7 +58,14 @@
"cell_type": "code",
"execution_count": null,
"id": "680ceb3e",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:17.006699Z",
+ "iopub.status.busy": "2023-07-26T12:47:17.006226Z",
+ "iopub.status.idle": "2023-07-26T12:47:17.561114Z",
+ "shell.execute_reply": "2023-07-26T12:47:17.560739Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -70,7 +77,14 @@
"cell_type": "code",
"execution_count": null,
"id": "ccdf3e4f",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:17.563075Z",
+ "iopub.status.busy": "2023-07-26T12:47:17.562947Z",
+ "iopub.status.idle": "2023-07-26T12:47:17.565074Z",
+ "shell.execute_reply": "2023-07-26T12:47:17.564824Z"
+ }
+ },
"outputs": [],
"source": [
"College.shape"
@@ -80,7 +94,14 @@
"cell_type": "code",
"execution_count": null,
"id": "09f59747",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:17.566389Z",
+ "iopub.status.busy": "2023-07-26T12:47:17.566297Z",
+ "iopub.status.idle": "2023-07-26T12:47:17.568257Z",
+ "shell.execute_reply": "2023-07-26T12:47:17.568025Z"
+ }
+ },
"outputs": [],
"source": [
"College.columns"
@@ -90,7 +111,14 @@
"cell_type": "code",
"execution_count": null,
"id": "6a48dfd5",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:17.569585Z",
+ "iopub.status.busy": "2023-07-26T12:47:17.569492Z",
+ "iopub.status.idle": "2023-07-26T12:47:17.582384Z",
+ "shell.execute_reply": "2023-07-26T12:47:17.582154Z"
+ }
+ },
"outputs": [],
"source": [
"College.describe().iloc[:,:4]"
@@ -107,6 +135,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Credit.ipynb b/docs/source/datasets/Credit.ipynb
index f5e51a9..d604aaa 100644
--- a/docs/source/datasets/Credit.ipynb
+++ b/docs/source/datasets/Credit.ipynb
@@ -43,7 +43,14 @@
"cell_type": "code",
"execution_count": null,
"id": "c4895446",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:39.024610Z",
+ "iopub.status.busy": "2023-07-26T12:47:39.024341Z",
+ "iopub.status.idle": "2023-07-26T12:47:39.593395Z",
+ "shell.execute_reply": "2023-07-26T12:47:39.593133Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -55,7 +62,14 @@
"cell_type": "code",
"execution_count": null,
"id": "c738c66b",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:39.595074Z",
+ "iopub.status.busy": "2023-07-26T12:47:39.594871Z",
+ "iopub.status.idle": "2023-07-26T12:47:39.596893Z",
+ "shell.execute_reply": "2023-07-26T12:47:39.596667Z"
+ }
+ },
"outputs": [],
"source": [
"Credit.shape"
@@ -65,7 +79,14 @@
"cell_type": "code",
"execution_count": null,
"id": "d612f5a7",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:39.598266Z",
+ "iopub.status.busy": "2023-07-26T12:47:39.598173Z",
+ "iopub.status.idle": "2023-07-26T12:47:39.600134Z",
+ "shell.execute_reply": "2023-07-26T12:47:39.599913Z"
+ }
+ },
"outputs": [],
"source": [
"Credit.columns"
@@ -75,7 +96,14 @@
"cell_type": "code",
"execution_count": null,
"id": "45633b1a",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:39.601442Z",
+ "iopub.status.busy": "2023-07-26T12:47:39.601344Z",
+ "iopub.status.idle": "2023-07-26T12:47:39.609927Z",
+ "shell.execute_reply": "2023-07-26T12:47:39.609656Z"
+ }
+ },
"outputs": [],
"source": [
"Credit.describe().iloc[:,:4]"
@@ -92,6 +120,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Default.ipynb b/docs/source/datasets/Default.ipynb
index 64357ef..8023d39 100644
--- a/docs/source/datasets/Default.ipynb
+++ b/docs/source/datasets/Default.ipynb
@@ -27,7 +27,14 @@
"cell_type": "code",
"execution_count": null,
"id": "ab810dee",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:36.566964Z",
+ "iopub.status.busy": "2023-07-26T12:47:36.566691Z",
+ "iopub.status.idle": "2023-07-26T12:47:37.127499Z",
+ "shell.execute_reply": "2023-07-26T12:47:37.127183Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -39,7 +46,14 @@
"cell_type": "code",
"execution_count": null,
"id": "086ef3a2",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:37.129114Z",
+ "iopub.status.busy": "2023-07-26T12:47:37.129003Z",
+ "iopub.status.idle": "2023-07-26T12:47:37.131023Z",
+ "shell.execute_reply": "2023-07-26T12:47:37.130778Z"
+ }
+ },
"outputs": [],
"source": [
"Default.shape"
@@ -49,7 +63,14 @@
"cell_type": "code",
"execution_count": null,
"id": "6600c13b",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:37.132471Z",
+ "iopub.status.busy": "2023-07-26T12:47:37.132373Z",
+ "iopub.status.idle": "2023-07-26T12:47:37.134281Z",
+ "shell.execute_reply": "2023-07-26T12:47:37.134067Z"
+ }
+ },
"outputs": [],
"source": [
"Default.columns"
@@ -59,7 +80,14 @@
"cell_type": "code",
"execution_count": null,
"id": "09e98840",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:37.135578Z",
+ "iopub.status.busy": "2023-07-26T12:47:37.135480Z",
+ "iopub.status.idle": "2023-07-26T12:47:37.141213Z",
+ "shell.execute_reply": "2023-07-26T12:47:37.140974Z"
+ }
+ },
"outputs": [],
"source": [
"Default.describe()"
@@ -69,7 +97,14 @@
"cell_type": "code",
"execution_count": null,
"id": "425f0cb1",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:37.142597Z",
+ "iopub.status.busy": "2023-07-26T12:47:37.142519Z",
+ "iopub.status.idle": "2023-07-26T12:47:37.145148Z",
+ "shell.execute_reply": "2023-07-26T12:47:37.144915Z"
+ }
+ },
"outputs": [],
"source": [
"Default['student'].value_counts()"
@@ -86,6 +121,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Fund.ipynb b/docs/source/datasets/Fund.ipynb
index fce1859..2e5dcb5 100644
--- a/docs/source/datasets/Fund.ipynb
+++ b/docs/source/datasets/Fund.ipynb
@@ -15,7 +15,14 @@
"cell_type": "code",
"execution_count": null,
"id": "5eba8e49",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:59.809785Z",
+ "iopub.status.busy": "2023-07-26T12:46:59.809389Z",
+ "iopub.status.idle": "2023-07-26T12:47:00.410897Z",
+ "shell.execute_reply": "2023-07-26T12:47:00.410627Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -27,7 +34,14 @@
"cell_type": "code",
"execution_count": null,
"id": "ced3b335",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:00.412492Z",
+ "iopub.status.busy": "2023-07-26T12:47:00.412385Z",
+ "iopub.status.idle": "2023-07-26T12:47:00.414444Z",
+ "shell.execute_reply": "2023-07-26T12:47:00.414168Z"
+ }
+ },
"outputs": [],
"source": [
"Fund.shape"
@@ -37,7 +51,14 @@
"cell_type": "code",
"execution_count": null,
"id": "bfff1ac6",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:00.415891Z",
+ "iopub.status.busy": "2023-07-26T12:47:00.415789Z",
+ "iopub.status.idle": "2023-07-26T12:47:00.417755Z",
+ "shell.execute_reply": "2023-07-26T12:47:00.417529Z"
+ }
+ },
"outputs": [],
"source": [
"Fund.columns"
@@ -54,6 +75,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Hitters.ipynb b/docs/source/datasets/Hitters.ipynb
index 6f261cd..5af634c 100644
--- a/docs/source/datasets/Hitters.ipynb
+++ b/docs/source/datasets/Hitters.ipynb
@@ -64,7 +64,14 @@
"cell_type": "code",
"execution_count": null,
"id": "4fa187f0",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:34.072657Z",
+ "iopub.status.busy": "2023-07-26T12:47:34.072382Z",
+ "iopub.status.idle": "2023-07-26T12:47:34.654518Z",
+ "shell.execute_reply": "2023-07-26T12:47:34.654230Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -76,7 +83,14 @@
"cell_type": "code",
"execution_count": null,
"id": "04535ffb",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:34.656071Z",
+ "iopub.status.busy": "2023-07-26T12:47:34.655969Z",
+ "iopub.status.idle": "2023-07-26T12:47:34.657899Z",
+ "shell.execute_reply": "2023-07-26T12:47:34.657674Z"
+ }
+ },
"outputs": [],
"source": [
"Hitters.shape"
@@ -86,7 +100,14 @@
"cell_type": "code",
"execution_count": null,
"id": "6875aac6",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:34.659335Z",
+ "iopub.status.busy": "2023-07-26T12:47:34.659236Z",
+ "iopub.status.idle": "2023-07-26T12:47:34.661182Z",
+ "shell.execute_reply": "2023-07-26T12:47:34.660944Z"
+ }
+ },
"outputs": [],
"source": [
"Hitters.columns"
@@ -96,7 +117,14 @@
"cell_type": "code",
"execution_count": null,
"id": "9e2cffc8",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:34.662645Z",
+ "iopub.status.busy": "2023-07-26T12:47:34.662543Z",
+ "iopub.status.idle": "2023-07-26T12:47:34.674958Z",
+ "shell.execute_reply": "2023-07-26T12:47:34.674698Z"
+ }
+ },
"outputs": [],
"source": [
"Hitters.describe().iloc[:,:4]"
@@ -113,6 +141,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Khan.ipynb b/docs/source/datasets/Khan.ipynb
index f12a5ca..c1ce7bf 100644
--- a/docs/source/datasets/Khan.ipynb
+++ b/docs/source/datasets/Khan.ipynb
@@ -43,7 +43,14 @@
"cell_type": "code",
"execution_count": null,
"id": "bfda6cad",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:53.879692Z",
+ "iopub.status.busy": "2023-07-26T12:46:53.879072Z",
+ "iopub.status.idle": "2023-07-26T12:46:54.473904Z",
+ "shell.execute_reply": "2023-07-26T12:46:54.473562Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -55,7 +62,14 @@
"cell_type": "code",
"execution_count": null,
"id": "70514dc5",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:54.475443Z",
+ "iopub.status.busy": "2023-07-26T12:46:54.475340Z",
+ "iopub.status.idle": "2023-07-26T12:46:54.477103Z",
+ "shell.execute_reply": "2023-07-26T12:46:54.476883Z"
+ }
+ },
"outputs": [],
"source": [
"for X in ['xtest', 'xtrain']:\n",
@@ -66,7 +80,14 @@
"cell_type": "code",
"execution_count": null,
"id": "e9df5de8",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:54.478408Z",
+ "iopub.status.busy": "2023-07-26T12:46:54.478336Z",
+ "iopub.status.idle": "2023-07-26T12:46:54.480540Z",
+ "shell.execute_reply": "2023-07-26T12:46:54.480299Z"
+ }
+ },
"outputs": [],
"source": [
"for Y in ['ytest', 'ytrain']:\n",
@@ -84,6 +105,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/NCI60.ipynb b/docs/source/datasets/NCI60.ipynb
index bbb576f..b38f981 100644
--- a/docs/source/datasets/NCI60.ipynb
+++ b/docs/source/datasets/NCI60.ipynb
@@ -26,7 +26,14 @@
"cell_type": "code",
"execution_count": null,
"id": "c88c2eaf",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:07.189429Z",
+ "iopub.status.busy": "2023-07-26T12:47:07.188891Z",
+ "iopub.status.idle": "2023-07-26T12:47:07.734853Z",
+ "shell.execute_reply": "2023-07-26T12:47:07.734392Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -38,7 +45,14 @@
"cell_type": "code",
"execution_count": null,
"id": "0e6279ad",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:07.736643Z",
+ "iopub.status.busy": "2023-07-26T12:47:07.736477Z",
+ "iopub.status.idle": "2023-07-26T12:47:07.740295Z",
+ "shell.execute_reply": "2023-07-26T12:47:07.739954Z"
+ }
+ },
"outputs": [],
"source": [
"NCI60['labels'].value_counts()"
@@ -48,7 +62,14 @@
"cell_type": "code",
"execution_count": null,
"id": "ed5ddd2f",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:07.741963Z",
+ "iopub.status.busy": "2023-07-26T12:47:07.741866Z",
+ "iopub.status.idle": "2023-07-26T12:47:07.744496Z",
+ "shell.execute_reply": "2023-07-26T12:47:07.744146Z"
+ }
+ },
"outputs": [],
"source": [
"NCI60['data'].shape"
@@ -65,6 +86,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/NYSE.ipynb b/docs/source/datasets/NYSE.ipynb
index 5f9dbd5..4fb6ea5 100644
--- a/docs/source/datasets/NYSE.ipynb
+++ b/docs/source/datasets/NYSE.ipynb
@@ -33,7 +33,14 @@
"cell_type": "code",
"execution_count": null,
"id": "fcff6c95",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:24.365935Z",
+ "iopub.status.busy": "2023-07-26T12:47:24.365648Z",
+ "iopub.status.idle": "2023-07-26T12:47:24.910157Z",
+ "shell.execute_reply": "2023-07-26T12:47:24.909886Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -45,7 +52,14 @@
"cell_type": "code",
"execution_count": null,
"id": "84426961",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:24.911976Z",
+ "iopub.status.busy": "2023-07-26T12:47:24.911859Z",
+ "iopub.status.idle": "2023-07-26T12:47:24.913899Z",
+ "shell.execute_reply": "2023-07-26T12:47:24.913685Z"
+ }
+ },
"outputs": [],
"source": [
"NYSE.shape"
@@ -55,7 +69,14 @@
"cell_type": "code",
"execution_count": null,
"id": "e6194a8c",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:24.915295Z",
+ "iopub.status.busy": "2023-07-26T12:47:24.915180Z",
+ "iopub.status.idle": "2023-07-26T12:47:24.917209Z",
+ "shell.execute_reply": "2023-07-26T12:47:24.916991Z"
+ }
+ },
"outputs": [],
"source": [
"NYSE.columns"
@@ -65,7 +86,14 @@
"cell_type": "code",
"execution_count": null,
"id": "0c7bf3d7",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:24.918571Z",
+ "iopub.status.busy": "2023-07-26T12:47:24.918468Z",
+ "iopub.status.idle": "2023-07-26T12:47:24.924914Z",
+ "shell.execute_reply": "2023-07-26T12:47:24.924671Z"
+ }
+ },
"outputs": [],
"source": [
"NYSE.describe()"
@@ -82,6 +110,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/OJ.ipynb b/docs/source/datasets/OJ.ipynb
index e18a4de..55ffeb9 100644
--- a/docs/source/datasets/OJ.ipynb
+++ b/docs/source/datasets/OJ.ipynb
@@ -61,7 +61,14 @@
"cell_type": "code",
"execution_count": null,
"id": "609742da",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:14.553008Z",
+ "iopub.status.busy": "2023-07-26T12:47:14.551694Z",
+ "iopub.status.idle": "2023-07-26T12:47:15.102658Z",
+ "shell.execute_reply": "2023-07-26T12:47:15.102334Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -73,7 +80,14 @@
"cell_type": "code",
"execution_count": null,
"id": "6f195dcd",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:15.104419Z",
+ "iopub.status.busy": "2023-07-26T12:47:15.104301Z",
+ "iopub.status.idle": "2023-07-26T12:47:15.106415Z",
+ "shell.execute_reply": "2023-07-26T12:47:15.106177Z"
+ }
+ },
"outputs": [],
"source": [
"OJ.shape"
@@ -83,7 +97,14 @@
"cell_type": "code",
"execution_count": null,
"id": "aaafb83b",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:15.107821Z",
+ "iopub.status.busy": "2023-07-26T12:47:15.107723Z",
+ "iopub.status.idle": "2023-07-26T12:47:15.109747Z",
+ "shell.execute_reply": "2023-07-26T12:47:15.109486Z"
+ }
+ },
"outputs": [],
"source": [
"OJ.columns"
@@ -93,7 +114,14 @@
"cell_type": "code",
"execution_count": null,
"id": "774dfa86",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:15.111066Z",
+ "iopub.status.busy": "2023-07-26T12:47:15.110974Z",
+ "iopub.status.idle": "2023-07-26T12:47:15.123225Z",
+ "shell.execute_reply": "2023-07-26T12:47:15.122965Z"
+ }
+ },
"outputs": [],
"source": [
"OJ.describe().iloc[:,:4]"
@@ -110,6 +138,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Portfolio.ipynb b/docs/source/datasets/Portfolio.ipynb
index 6d6a60d..1a6d711 100644
--- a/docs/source/datasets/Portfolio.ipynb
+++ b/docs/source/datasets/Portfolio.ipynb
@@ -22,7 +22,14 @@
"cell_type": "code",
"execution_count": null,
"id": "3adff220",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:02.309375Z",
+ "iopub.status.busy": "2023-07-26T12:47:02.308873Z",
+ "iopub.status.idle": "2023-07-26T12:47:02.849537Z",
+ "shell.execute_reply": "2023-07-26T12:47:02.849247Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -34,7 +41,14 @@
"cell_type": "code",
"execution_count": null,
"id": "b02a9e67",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:02.851392Z",
+ "iopub.status.busy": "2023-07-26T12:47:02.851244Z",
+ "iopub.status.idle": "2023-07-26T12:47:02.853779Z",
+ "shell.execute_reply": "2023-07-26T12:47:02.853348Z"
+ }
+ },
"outputs": [],
"source": [
"Portfolio.shape"
@@ -44,7 +58,14 @@
"cell_type": "code",
"execution_count": null,
"id": "3e83a0ed",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:02.855660Z",
+ "iopub.status.busy": "2023-07-26T12:47:02.855540Z",
+ "iopub.status.idle": "2023-07-26T12:47:02.858065Z",
+ "shell.execute_reply": "2023-07-26T12:47:02.857779Z"
+ }
+ },
"outputs": [],
"source": [
"Portfolio.columns"
@@ -54,7 +75,14 @@
"cell_type": "code",
"execution_count": null,
"id": "3ebec412",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:02.859606Z",
+ "iopub.status.busy": "2023-07-26T12:47:02.859503Z",
+ "iopub.status.idle": "2023-07-26T12:47:02.865754Z",
+ "shell.execute_reply": "2023-07-26T12:47:02.865418Z"
+ }
+ },
"outputs": [],
"source": [
"Portfolio.describe()"
@@ -71,6 +99,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Publication.ipynb b/docs/source/datasets/Publication.ipynb
index a4a6dfa..de4a449 100644
--- a/docs/source/datasets/Publication.ipynb
+++ b/docs/source/datasets/Publication.ipynb
@@ -45,7 +45,14 @@
"cell_type": "code",
"execution_count": null,
"id": "61d7c2b3",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:29.196850Z",
+ "iopub.status.busy": "2023-07-26T12:47:29.196559Z",
+ "iopub.status.idle": "2023-07-26T12:47:29.727827Z",
+ "shell.execute_reply": "2023-07-26T12:47:29.727421Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -57,7 +64,14 @@
"cell_type": "code",
"execution_count": null,
"id": "4d72460d",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:29.729844Z",
+ "iopub.status.busy": "2023-07-26T12:47:29.729686Z",
+ "iopub.status.idle": "2023-07-26T12:47:29.732275Z",
+ "shell.execute_reply": "2023-07-26T12:47:29.732008Z"
+ }
+ },
"outputs": [],
"source": [
"Publication.shape"
@@ -67,7 +81,14 @@
"cell_type": "code",
"execution_count": null,
"id": "fd34224c",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:29.734028Z",
+ "iopub.status.busy": "2023-07-26T12:47:29.733885Z",
+ "iopub.status.idle": "2023-07-26T12:47:29.736365Z",
+ "shell.execute_reply": "2023-07-26T12:47:29.736014Z"
+ }
+ },
"outputs": [],
"source": [
"Publication.columns"
@@ -77,7 +98,14 @@
"cell_type": "code",
"execution_count": null,
"id": "51bfb0aa",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:29.738169Z",
+ "iopub.status.busy": "2023-07-26T12:47:29.738046Z",
+ "iopub.status.idle": "2023-07-26T12:47:29.747027Z",
+ "shell.execute_reply": "2023-07-26T12:47:29.746722Z"
+ }
+ },
"outputs": [],
"source": [
"Publication.describe().iloc[:,:4]"
@@ -94,6 +122,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Smarket.ipynb b/docs/source/datasets/Smarket.ipynb
index cced2a9..0be4dd9 100644
--- a/docs/source/datasets/Smarket.ipynb
+++ b/docs/source/datasets/Smarket.ipynb
@@ -41,7 +41,14 @@
"cell_type": "code",
"execution_count": null,
"id": "3d920337",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:21.928355Z",
+ "iopub.status.busy": "2023-07-26T12:47:21.927766Z",
+ "iopub.status.idle": "2023-07-26T12:47:22.480597Z",
+ "shell.execute_reply": "2023-07-26T12:47:22.480297Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -53,7 +60,14 @@
"cell_type": "code",
"execution_count": null,
"id": "25d90138",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:22.482125Z",
+ "iopub.status.busy": "2023-07-26T12:47:22.482016Z",
+ "iopub.status.idle": "2023-07-26T12:47:22.484017Z",
+ "shell.execute_reply": "2023-07-26T12:47:22.483801Z"
+ }
+ },
"outputs": [],
"source": [
"Smarket.shape"
@@ -63,7 +77,14 @@
"cell_type": "code",
"execution_count": null,
"id": "0e8c57de",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:22.485456Z",
+ "iopub.status.busy": "2023-07-26T12:47:22.485359Z",
+ "iopub.status.idle": "2023-07-26T12:47:22.487416Z",
+ "shell.execute_reply": "2023-07-26T12:47:22.487186Z"
+ }
+ },
"outputs": [],
"source": [
"Smarket.columns"
@@ -73,7 +94,14 @@
"cell_type": "code",
"execution_count": null,
"id": "2d455f1e",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:22.488803Z",
+ "iopub.status.busy": "2023-07-26T12:47:22.488706Z",
+ "iopub.status.idle": "2023-07-26T12:47:22.497401Z",
+ "shell.execute_reply": "2023-07-26T12:47:22.497165Z"
+ }
+ },
"outputs": [],
"source": [
"Smarket.describe().iloc[:,-4:]"
@@ -90,6 +118,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/USArrests.ipynb b/docs/source/datasets/USArrests.ipynb
index 1107424..d860098 100644
--- a/docs/source/datasets/USArrests.ipynb
+++ b/docs/source/datasets/USArrests.ipynb
@@ -28,9 +28,16 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"id": "feab45d4-ce30-4ea9-800c-bbe9e7c11f6d",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:56.351520Z",
+ "iopub.status.busy": "2023-07-26T12:46:56.350481Z",
+ "iopub.status.idle": "2023-07-26T12:46:58.021100Z",
+ "shell.execute_reply": "2023-07-26T12:46:58.019698Z"
+ }
+ },
"outputs": [],
"source": [
"from statsmodels.datasets import get_rdataset\n",
@@ -39,157 +46,51 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "bdfffad4-6ab1-45da-8d62-8a7c4326fb24",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(50, 4)"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:58.027241Z",
+ "iopub.status.busy": "2023-07-26T12:46:58.026857Z",
+ "iopub.status.idle": "2023-07-26T12:46:58.034424Z",
+ "shell.execute_reply": "2023-07-26T12:46:58.033781Z"
}
- ],
+ },
+ "outputs": [],
"source": [
"USArrests.shape"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"id": "02f28a67-e8b9-4a17-ad0d-88672e1de26d",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['Murder', 'Assault', 'UrbanPop', 'Rape'], dtype='object')"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:58.038173Z",
+ "iopub.status.busy": "2023-07-26T12:46:58.037943Z",
+ "iopub.status.idle": "2023-07-26T12:46:58.041828Z",
+ "shell.execute_reply": "2023-07-26T12:46:58.041345Z"
}
- ],
+ },
+ "outputs": [],
"source": [
"USArrests.columns"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"id": "711db396-64d6-4fbd-9be4-bebe4117216f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Murder | \n",
- " Assault | \n",
- " UrbanPop | \n",
- " Rape | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | count | \n",
- " 50.00000 | \n",
- " 50.000000 | \n",
- " 50.000000 | \n",
- " 50.000000 | \n",
- "
\n",
- " \n",
- " | mean | \n",
- " 7.78800 | \n",
- " 170.760000 | \n",
- " 65.540000 | \n",
- " 21.232000 | \n",
- "
\n",
- " \n",
- " | std | \n",
- " 4.35551 | \n",
- " 83.337661 | \n",
- " 14.474763 | \n",
- " 9.366385 | \n",
- "
\n",
- " \n",
- " | min | \n",
- " 0.80000 | \n",
- " 45.000000 | \n",
- " 32.000000 | \n",
- " 7.300000 | \n",
- "
\n",
- " \n",
- " | 25% | \n",
- " 4.07500 | \n",
- " 109.000000 | \n",
- " 54.500000 | \n",
- " 15.075000 | \n",
- "
\n",
- " \n",
- " | 50% | \n",
- " 7.25000 | \n",
- " 159.000000 | \n",
- " 66.000000 | \n",
- " 20.100000 | \n",
- "
\n",
- " \n",
- " | 75% | \n",
- " 11.25000 | \n",
- " 249.000000 | \n",
- " 77.750000 | \n",
- " 26.175000 | \n",
- "
\n",
- " \n",
- " | max | \n",
- " 17.40000 | \n",
- " 337.000000 | \n",
- " 91.000000 | \n",
- " 46.000000 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Murder Assault UrbanPop Rape\n",
- "count 50.00000 50.000000 50.000000 50.000000\n",
- "mean 7.78800 170.760000 65.540000 21.232000\n",
- "std 4.35551 83.337661 14.474763 9.366385\n",
- "min 0.80000 45.000000 32.000000 7.300000\n",
- "25% 4.07500 109.000000 54.500000 15.075000\n",
- "50% 7.25000 159.000000 66.000000 20.100000\n",
- "75% 11.25000 249.000000 77.750000 26.175000\n",
- "max 17.40000 337.000000 91.000000 46.000000"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:58.044543Z",
+ "iopub.status.busy": "2023-07-26T12:46:58.044381Z",
+ "iopub.status.idle": "2023-07-26T12:46:58.057559Z",
+ "shell.execute_reply": "2023-07-26T12:46:58.057142Z"
}
- ],
+ },
+ "outputs": [],
"source": [
"USArrests.describe()"
]
@@ -216,7 +117,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Wage.ipynb b/docs/source/datasets/Wage.ipynb
index b95d853..28bb484 100644
--- a/docs/source/datasets/Wage.ipynb
+++ b/docs/source/datasets/Wage.ipynb
@@ -53,7 +53,14 @@
"cell_type": "code",
"execution_count": null,
"id": "6832d321",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:04.731864Z",
+ "iopub.status.busy": "2023-07-26T12:47:04.731413Z",
+ "iopub.status.idle": "2023-07-26T12:47:05.295785Z",
+ "shell.execute_reply": "2023-07-26T12:47:05.295452Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -65,7 +72,14 @@
"cell_type": "code",
"execution_count": null,
"id": "1c1ad3f3",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:05.297482Z",
+ "iopub.status.busy": "2023-07-26T12:47:05.297357Z",
+ "iopub.status.idle": "2023-07-26T12:47:05.299508Z",
+ "shell.execute_reply": "2023-07-26T12:47:05.299247Z"
+ }
+ },
"outputs": [],
"source": [
"Wage.shape"
@@ -75,7 +89,14 @@
"cell_type": "code",
"execution_count": null,
"id": "d56ab6a4",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:05.300989Z",
+ "iopub.status.busy": "2023-07-26T12:47:05.300875Z",
+ "iopub.status.idle": "2023-07-26T12:47:05.303024Z",
+ "shell.execute_reply": "2023-07-26T12:47:05.302786Z"
+ }
+ },
"outputs": [],
"source": [
"Wage.columns"
@@ -85,7 +106,14 @@
"cell_type": "code",
"execution_count": null,
"id": "5f021939",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:05.304606Z",
+ "iopub.status.busy": "2023-07-26T12:47:05.304487Z",
+ "iopub.status.idle": "2023-07-26T12:47:05.311771Z",
+ "shell.execute_reply": "2023-07-26T12:47:05.311522Z"
+ }
+ },
"outputs": [],
"source": [
"Wage.describe()"
@@ -102,6 +130,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/datasets/Weekly.ipynb b/docs/source/datasets/Weekly.ipynb
index 69f26d6..15a1050 100644
--- a/docs/source/datasets/Weekly.ipynb
+++ b/docs/source/datasets/Weekly.ipynb
@@ -41,7 +41,14 @@
"cell_type": "code",
"execution_count": null,
"id": "d19dd431",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:41.468580Z",
+ "iopub.status.busy": "2023-07-26T12:47:41.468291Z",
+ "iopub.status.idle": "2023-07-26T12:47:41.999679Z",
+ "shell.execute_reply": "2023-07-26T12:47:41.999341Z"
+ }
+ },
"outputs": [],
"source": [
"from ISLP import load_data\n",
@@ -53,7 +60,14 @@
"cell_type": "code",
"execution_count": null,
"id": "17d2cda4",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:42.002632Z",
+ "iopub.status.busy": "2023-07-26T12:47:42.002470Z",
+ "iopub.status.idle": "2023-07-26T12:47:42.004871Z",
+ "shell.execute_reply": "2023-07-26T12:47:42.004611Z"
+ }
+ },
"outputs": [],
"source": [
"Weekly.shape"
@@ -63,7 +77,14 @@
"cell_type": "code",
"execution_count": null,
"id": "f822715b",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:42.006534Z",
+ "iopub.status.busy": "2023-07-26T12:47:42.006422Z",
+ "iopub.status.idle": "2023-07-26T12:47:42.008496Z",
+ "shell.execute_reply": "2023-07-26T12:47:42.008187Z"
+ }
+ },
"outputs": [],
"source": [
"Weekly.columns"
@@ -73,7 +94,14 @@
"cell_type": "code",
"execution_count": null,
"id": "9a5f4d04",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:47:42.010010Z",
+ "iopub.status.busy": "2023-07-26T12:47:42.009911Z",
+ "iopub.status.idle": "2023-07-26T12:47:42.019036Z",
+ "shell.execute_reply": "2023-07-26T12:47:42.018706Z"
+ }
+ },
"outputs": [],
"source": [
"Weekly.describe().iloc[:,:4]"
@@ -98,6 +126,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/docs_version.json b/docs/source/docs_version.json
new file mode 100644
index 0000000..d6217ce
--- /dev/null
+++ b/docs/source/docs_version.json
@@ -0,0 +1,4 @@
+{"labs": "v2.2",
+ "library": "v0.4.0",
+ "comment":"labs should be version of ISLP pointed to in ISLP_labs/README.md, library version should be explicitly marked in ISLP_labs/requirements.txt; don't forget to strip warnings!!!!!!!!"
+}
diff --git a/docs/source/helpers/cluster.ipynb b/docs/source/helpers/cluster.ipynb
index 56cf3d8..4aa7de3 100644
--- a/docs/source/helpers/cluster.ipynb
+++ b/docs/source/helpers/cluster.ipynb
@@ -8,14 +8,21 @@
"# Clustering\n",
"\n",
"This module has a single function, used to help visualize a dendrogram from a\n",
- "hierarchical clustering."
+ "hierarchical clustering. The function is based on this example from [sklearn.cluster](https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5df152d",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:42.214971Z",
+ "iopub.status.busy": "2023-07-26T12:46:42.214537Z",
+ "iopub.status.idle": "2023-07-26T12:46:42.860533Z",
+ "shell.execute_reply": "2023-07-26T12:46:42.860243Z"
+ }
+ },
"outputs": [],
"source": [
"import numpy as np\n",
@@ -36,7 +43,14 @@
"cell_type": "code",
"execution_count": null,
"id": "0135c1fb",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:42.862401Z",
+ "iopub.status.busy": "2023-07-26T12:46:42.862250Z",
+ "iopub.status.idle": "2023-07-26T12:46:42.864336Z",
+ "shell.execute_reply": "2023-07-26T12:46:42.864118Z"
+ }
+ },
"outputs": [],
"source": [
"rng = np.random.default_rng(1)\n",
@@ -56,7 +70,14 @@
"cell_type": "code",
"execution_count": null,
"id": "17c52650",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:42.865831Z",
+ "iopub.status.busy": "2023-07-26T12:46:42.865731Z",
+ "iopub.status.idle": "2023-07-26T12:46:42.867386Z",
+ "shell.execute_reply": "2023-07-26T12:46:42.867147Z"
+ }
+ },
"outputs": [],
"source": [
"clust = AgglomerativeClustering(distance_threshold=0,\n",
@@ -68,7 +89,14 @@
"cell_type": "code",
"execution_count": null,
"id": "a3ae2622",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:42.868746Z",
+ "iopub.status.busy": "2023-07-26T12:46:42.868668Z",
+ "iopub.status.idle": "2023-07-26T12:46:42.872497Z",
+ "shell.execute_reply": "2023-07-26T12:46:42.872240Z"
+ }
+ },
"outputs": [],
"source": [
"clust.fit(X)"
@@ -86,7 +114,14 @@
"cell_type": "code",
"execution_count": null,
"id": "64e726a4",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:42.873930Z",
+ "iopub.status.busy": "2023-07-26T12:46:42.873845Z",
+ "iopub.status.idle": "2023-07-26T12:46:43.195508Z",
+ "shell.execute_reply": "2023-07-26T12:46:43.195084Z"
+ }
+ },
"outputs": [],
"source": [
"linkage = compute_linkage(clust)\n",
@@ -101,9 +136,21 @@
"main_language": "python"
},
"kernelspec": {
- "display_name": "python3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/helpers/pygam.ipynb b/docs/source/helpers/pygam.ipynb
index aab61d1..b452294 100644
--- a/docs/source/helpers/pygam.ipynb
+++ b/docs/source/helpers/pygam.ipynb
@@ -16,7 +16,14 @@
"cell_type": "code",
"execution_count": null,
"id": "9a52fb27",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:47.820912Z",
+ "iopub.status.busy": "2023-07-26T12:46:47.820490Z",
+ "iopub.status.idle": "2023-07-26T12:46:48.577304Z",
+ "shell.execute_reply": "2023-07-26T12:46:48.577007Z"
+ }
+ },
"outputs": [],
"source": [
"import numpy as np\n",
@@ -46,7 +53,14 @@
"cell_type": "code",
"execution_count": null,
"id": "4bddce77",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:48.579295Z",
+ "iopub.status.busy": "2023-07-26T12:46:48.579114Z",
+ "iopub.status.idle": "2023-07-26T12:46:48.581608Z",
+ "shell.execute_reply": "2023-07-26T12:46:48.581355Z"
+ }
+ },
"outputs": [],
"source": [
"rng = np.random.default_rng(1)\n",
@@ -69,7 +83,14 @@
"cell_type": "code",
"execution_count": null,
"id": "3f8946e0",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:48.583287Z",
+ "iopub.status.busy": "2023-07-26T12:46:48.583187Z",
+ "iopub.status.idle": "2023-07-26T12:46:48.618486Z",
+ "shell.execute_reply": "2023-07-26T12:46:48.614888Z"
+ }
+ },
"outputs": [],
"source": [
"terms = [s(f, lam=0.01) for f in range(3)]\n",
@@ -91,7 +112,14 @@
"cell_type": "code",
"execution_count": null,
"id": "c5b38706",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:48.624580Z",
+ "iopub.status.busy": "2023-07-26T12:46:48.624177Z",
+ "iopub.status.idle": "2023-07-26T12:46:48.814238Z",
+ "shell.execute_reply": "2023-07-26T12:46:48.808746Z"
+ }
+ },
"outputs": [],
"source": [
"ax = plot(gam, 0)"
@@ -109,7 +137,14 @@
"cell_type": "code",
"execution_count": null,
"id": "e4d2b6f0",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:48.825281Z",
+ "iopub.status.busy": "2023-07-26T12:46:48.824327Z",
+ "iopub.status.idle": "2023-07-26T12:46:48.897739Z",
+ "shell.execute_reply": "2023-07-26T12:46:48.897447Z"
+ }
+ },
"outputs": [],
"source": [
"ax.scatter(X[:,0], \n",
@@ -131,7 +166,14 @@
"cell_type": "code",
"execution_count": null,
"id": "82374baa",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:48.899404Z",
+ "iopub.status.busy": "2023-07-26T12:46:48.899288Z",
+ "iopub.status.idle": "2023-07-26T12:46:48.916570Z",
+ "shell.execute_reply": "2023-07-26T12:46:48.915079Z"
+ }
+ },
"outputs": [],
"source": [
"[degrees_of_freedom(X,\n",
@@ -153,7 +195,14 @@
"cell_type": "code",
"execution_count": null,
"id": "0576d1f3",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:48.924539Z",
+ "iopub.status.busy": "2023-07-26T12:46:48.924174Z",
+ "iopub.status.idle": "2023-07-26T12:46:48.955630Z",
+ "shell.execute_reply": "2023-07-26T12:46:48.954722Z"
+ }
+ },
"outputs": [],
"source": [
"lam_vals = [approx_lam(X,\n",
@@ -174,7 +223,14 @@
"cell_type": "code",
"execution_count": null,
"id": "3a8b546e",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:48.961056Z",
+ "iopub.status.busy": "2023-07-26T12:46:48.960521Z",
+ "iopub.status.idle": "2023-07-26T12:46:48.989331Z",
+ "shell.execute_reply": "2023-07-26T12:46:48.987244Z"
+ }
+ },
"outputs": [],
"source": [
"fixed_terms = [s(f, lam=l) for \n",
@@ -189,7 +245,14 @@
"cell_type": "code",
"execution_count": null,
"id": "f2cfbea2",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:48.995461Z",
+ "iopub.status.busy": "2023-07-26T12:46:48.994945Z",
+ "iopub.status.idle": "2023-07-26T12:46:49.130069Z",
+ "shell.execute_reply": "2023-07-26T12:46:49.129127Z"
+ }
+ },
"outputs": [],
"source": [
"ax = plot(fixed_gam, 0)\n",
@@ -210,6 +273,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/helpers/survival.ipynb b/docs/source/helpers/survival.ipynb
index 7cb30a3..f90123e 100644
--- a/docs/source/helpers/survival.ipynb
+++ b/docs/source/helpers/survival.ipynb
@@ -15,7 +15,14 @@
"cell_type": "code",
"execution_count": null,
"id": "0932cabc",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:45.058072Z",
+ "iopub.status.busy": "2023-07-26T12:46:45.057742Z",
+ "iopub.status.idle": "2023-07-26T12:46:45.657730Z",
+ "shell.execute_reply": "2023-07-26T12:46:45.657332Z"
+ }
+ },
"outputs": [],
"source": [
"import numpy as np\n",
@@ -40,7 +47,14 @@
"cell_type": "code",
"execution_count": null,
"id": "d82896bb",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:45.659634Z",
+ "iopub.status.busy": "2023-07-26T12:46:45.659493Z",
+ "iopub.status.idle": "2023-07-26T12:46:45.661327Z",
+ "shell.execute_reply": "2023-07-26T12:46:45.661109Z"
+ }
+ },
"outputs": [],
"source": [
"cum_haz = lambda t: t\n",
@@ -51,7 +65,14 @@
"cell_type": "code",
"execution_count": null,
"id": "c9f9d590",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:45.662631Z",
+ "iopub.status.busy": "2023-07-26T12:46:45.662534Z",
+ "iopub.status.idle": "2023-07-26T12:46:45.672267Z",
+ "shell.execute_reply": "2023-07-26T12:46:45.672017Z"
+ }
+ },
"outputs": [],
"source": [
"T = np.array([sim_time(np.log(2), cum_haz, rng) for _ in range(500)])"
@@ -69,7 +90,14 @@
"cell_type": "code",
"execution_count": null,
"id": "2d8478dc",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:45.673768Z",
+ "iopub.status.busy": "2023-07-26T12:46:45.673685Z",
+ "iopub.status.idle": "2023-07-26T12:46:45.934676Z",
+ "shell.execute_reply": "2023-07-26T12:46:45.934321Z"
+ }
+ },
"outputs": [],
"source": [
"kmf = KaplanMeierFitter(label=\"Simulated data\")\n",
@@ -111,6 +139,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/helpers/svm.ipynb b/docs/source/helpers/svm.ipynb
index 593d840..eb950b5 100644
--- a/docs/source/helpers/svm.ipynb
+++ b/docs/source/helpers/svm.ipynb
@@ -14,7 +14,14 @@
"cell_type": "code",
"execution_count": null,
"id": "2746a357",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:51.026740Z",
+ "iopub.status.busy": "2023-07-26T12:46:51.026289Z",
+ "iopub.status.idle": "2023-07-26T12:46:51.779743Z",
+ "shell.execute_reply": "2023-07-26T12:46:51.779280Z"
+ }
+ },
"outputs": [],
"source": [
"import numpy as np\n",
@@ -34,7 +41,14 @@
"cell_type": "code",
"execution_count": null,
"id": "4728535b",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:51.781697Z",
+ "iopub.status.busy": "2023-07-26T12:46:51.781546Z",
+ "iopub.status.idle": "2023-07-26T12:46:51.783810Z",
+ "shell.execute_reply": "2023-07-26T12:46:51.783514Z"
+ }
+ },
"outputs": [],
"source": [
"rng = np.random.default_rng(1)\n",
@@ -56,7 +70,14 @@
"cell_type": "code",
"execution_count": null,
"id": "74da6860",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:51.785373Z",
+ "iopub.status.busy": "2023-07-26T12:46:51.785272Z",
+ "iopub.status.idle": "2023-07-26T12:46:51.789605Z",
+ "shell.execute_reply": "2023-07-26T12:46:51.789351Z"
+ }
+ },
"outputs": [],
"source": [
"svm = SVC(kernel='linear')\n",
@@ -67,7 +88,14 @@
"cell_type": "code",
"execution_count": null,
"id": "d87b6f75",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:51.790987Z",
+ "iopub.status.busy": "2023-07-26T12:46:51.790907Z",
+ "iopub.status.idle": "2023-07-26T12:46:51.883284Z",
+ "shell.execute_reply": "2023-07-26T12:46:51.882993Z"
+ }
+ },
"outputs": [],
"source": [
"plot(X, Y, svm)"
@@ -89,7 +117,14 @@
"cell_type": "code",
"execution_count": null,
"id": "bc58956a",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-26T12:46:51.884984Z",
+ "iopub.status.busy": "2023-07-26T12:46:51.884867Z",
+ "iopub.status.idle": "2023-07-26T12:46:52.011375Z",
+ "shell.execute_reply": "2023-07-26T12:46:52.011081Z"
+ }
+ },
"outputs": [],
"source": [
"plot(X, Y, svm, features=(3, 4))"
@@ -106,6 +141,18 @@
"display_name": "python3",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/imdb.ipynb b/docs/source/imdb.ipynb
index 1718a58..d9ba5cb 100644
--- a/docs/source/imdb.ipynb
+++ b/docs/source/imdb.ipynb
@@ -5,71 +5,109 @@
"id": "50f2b809",
"metadata": {},
"source": [
- "# Creating a clean IMDB dataset\n",
+ "# Creating IMDB dataset from `keras` version\n",
+ "\n",
+ "This script details how the `IMDB` data in `ISLP` was constructed.\n",
"\n",
"Running this example requires `keras`. Use `pip install keras` to install if necessary."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "d920bb2e",
"metadata": {},
"outputs": [],
"source": [
- "import pickle"
+ "import pickle\n",
+ "import numpy as np\n",
+ "from scipy.sparse import coo_matrix, save_npz\n",
+ "import torch\n",
+ "from keras.datasets import imdb\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "e507f1fb",
+ "cell_type": "markdown",
+ "id": "eaf27f0c-0cb0-4ad5-8775-d138e3f20933",
"metadata": {},
- "outputs": [],
"source": [
- "import numpy as np\n",
- "from scipy.sparse import coo_matrix, save_npz\n",
- "import torch"
+ "We first load the data using `keras`, limiting focus to the 10000 most common words."
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b94d3f35",
+ "execution_count": 2,
+ "id": "29f0e01e",
"metadata": {},
"outputs": [],
"source": [
- "from keras.datasets import imdb\n",
- "from tensorflow.keras.preprocessing.sequence import pad_sequences"
+ "# the 3 is for three terms: \n",
+ "num_words = 10000+3\n",
+ "((S_train, L_train), \n",
+ " (S_test, L_test)) = imdb.load_data(num_words=num_words)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9020ab27-cc62-4b86-85ba-80a94ff692de",
+ "metadata": {},
+ "source": [
+ "The object `S_train` is effectively a list in which each document has been encoded into a sequence of\n",
+ "values from 0 to 10002."
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "29f0e01e",
+ "execution_count": 3,
+ "id": "e27564c4-320f-42b6-9f2e-2a2afdebefcf",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# the 3 is for three terms: \n",
- "num_words = 10000+3\n",
- "((S_train, Y_train), \n",
- " (S_test, Y_test)) = imdb.load_data(num_words=num_words)"
+ "S_train[0][:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15f039fe-faed-4884-a725-1c51d6c8d4d4",
+ "metadata": {},
+ "source": [
+ "We'll use `np.float32` as that is the common precision used in `torch`."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"id": "6cc3c3cb",
"metadata": {},
"outputs": [],
"source": [
- "Y_train = Y_train.astype(np.float32)\n",
- "Y_test = Y_test.astype(np.float32)"
+ "L_train = L_train.astype(np.float32)\n",
+ "L_test = L_test.astype(np.float32)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "005679bc-4337-4757-831e-f9a6ea50f6aa",
+ "metadata": {},
+ "source": [
+ "We will use a one-hot encoding that captures whether or not a given word appears in a given review."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"id": "7b6d1098",
"metadata": {},
"outputs": [],
@@ -88,18 +126,30 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "afcdc8b2",
"metadata": {},
"outputs": [],
"source": [
- "X_train, L_train = one_hot(S_train, num_words), Y_train\n",
+ "X_train = one_hot(S_train, num_words)\n",
"X_test = one_hot(S_test, num_words)"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "a67e299d-8774-4758-8953-77afdce775ab",
+ "metadata": {},
+ "source": [
+ "## Store as sparse tensors\n",
+ "\n",
+ "We see later in the lab that the dense representation is faster. Nevertheless,\n",
+ "let's store the one-hot representation as sparse `torch` tensors \n",
+ "as well as sparse `scipy` matrices."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "b19366ea",
"metadata": {},
"outputs": [],
@@ -115,7 +165,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "b45ae6d1",
"metadata": {},
"outputs": [],
@@ -126,7 +176,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "a47d6eb6",
"metadata": {},
"outputs": [],
@@ -137,7 +187,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "d1b37b37",
"metadata": {},
"outputs": [],
@@ -151,12 +201,12 @@
"id": "1119823a",
"metadata": {},
"source": [
- "save the sparse matrices"
+ "### Save as sparse `scipy` matrices"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "6cb6bfdf",
"metadata": {},
"outputs": [],
@@ -167,12 +217,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"id": "eac1c2ae",
"metadata": {},
"outputs": [],
"source": [
- "np.save('IMDB_Y_test.npy', Y_test)\n",
+ "np.save('IMDB_Y_test.npy', L_test)\n",
"np.save('IMDB_Y_train.npy', L_train)"
]
},
@@ -181,12 +231,14 @@
"id": "25c128e3",
"metadata": {},
"source": [
- "save and pickle the word index"
+ "## Save and pickle the word index\n",
+ "\n",
+ "We'll also want to store a lookup table to convert representations such as `S_train[0]` into words"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "8458bf67",
"metadata": {},
"outputs": [],
@@ -199,9 +251,46 @@
"lookup[4] = \"\""
]
},
+ {
+ "cell_type": "markdown",
+ "id": "5e62ebff-2575-4d35-b46c-51c6f7598efc",
+ "metadata": {},
+ "source": [
+ "Let's look at our first training document:"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
+ "id": "2aaefdf8-0a49-4bdb-8b40-55665283c8a8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\" this film was just brilliant casting location scenery story direction everyone's really suited part they played and you\""
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "' '.join([lookup[i] for i in S_train[0][:20]])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0e985a73-bfd9-42bd-a523-3dc6e223d602",
+ "metadata": {},
+ "source": [
+ "We save this lookup table so it can be loaded later "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
"id": "d95252de",
"metadata": {},
"outputs": [],
@@ -214,12 +303,15 @@
"id": "b3d900b9",
"metadata": {},
"source": [
- "create the padded representations"
+ "## Padded representations\n",
+ "\n",
+ "For some of the recurrent models, we'll need sequences of common lengths, padded if necessary.\n",
+ "Here, we pad up to a maximum length of 500, filling the remaining entries with 0."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"id": "637b3c5e",
"metadata": {},
"outputs": [],
@@ -230,9 +322,17 @@
" S_test]]"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "a6218300-b355-44cc-b7fb-4bff81211aa6",
+ "metadata": {},
+ "source": [
+ "Finally, we save these for later use in the deep learning lab."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"id": "bac69f88",
"metadata": {},
"outputs": [],
@@ -245,13 +345,24 @@
"metadata": {
"jupytext": {
"cell_metadata_filter": "-all",
- "formats": "source///ipynb,jupyterbook///md:myst,jupyterbook///ipynb",
- "main_language": "python"
+ "formats": "md,ipynb"
},
"kernelspec": {
- "display_name": "python3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.17"
}
},
"nbformat": 4,
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 2c80bdc..44b40fc 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,8 +3,7 @@ Welcome to ISLP documentation!
.. automodule:: ISLP
-Check out the :doc:`installation` section for further information.
-
+See the :doc:`api/index`
Contents
--------
@@ -16,5 +15,6 @@ Contents
transforms
models
helpers
+ labs
imdb
- api/index
+
diff --git a/docs/source/installation.myst b/docs/source/installation.myst
new file mode 100644
index 0000000..5fba989
--- /dev/null
+++ b/docs/source/installation.myst
@@ -0,0 +1,119 @@
+---
+file_format: mystnb
+kernelspec:
+ name: python3
+ display_name: python3
+---
+
+
+# Install instructions
+
+We generally recommend creating a [conda](https://anaconda.org) environment to isolate any code
+from other dependencies. The `ISLP` package does not have unusual dependencies, but this is still
+good practice.
+
+## Mac OS X / Linux
+
+To create a Python conda environment in a Mac OS X or Linux environment run:
+
+```{code-cell} ipython3
+---
+tags: [skip-execution]
+---
+conda create --name islp python
+```
+
+Current conda should install at least Python 3.9. If not, replace `python`
+with `python=3.10`, `python=3.11` or `python=3.12`.
+To run python
+code in this environment, you must activate it:
+
+```{code-cell} ipython3
+---
+tags: [skip-execution]
+---
+conda activate islp
+```
+
+## Windows
+
+On windows, create a `Python` environment called `islp` in the Anaconda app. This can be done by selecting `Environments` on the left hand side of the app's screen. After creating the environment, open a terminal within that environment by clicking on the "Play" button.
+
+# Installing `ISLP`
+
+Having completed the steps above, we use `pip` to install the `ISLP` package:
+
+```{code-cell} ipython3
+---
+tags: [skip-execution]
+---
+pip install ISLP
+```
+
+## Frozen environment
+
+```{attention}
+
+Python packages change frequently. The labs here are built
+with {{ ISLP_lab_link }}. Visit the lab git repo for specific instructions
+to install the frozen environment.
+```
+
+## Torch requirements
+
+The `ISLP` labs use `torch` and various related packages for the lab
+on deep learning. Most of the requirements are included in the requirements for `ISLP` though the labs
+also use `torchinfo` and `torchvision`. These will be installed by the `requirements.txt` above.
+
+```{attention}
+Because
+`torch` and related libraries change frequently, you will note that we
+have pinned the versions at specific versions that were used to make
+current versions of the labs.
+```
+
+## Jupyter
+
+```{attention}
+If using the Anaconda App, `jupyter` can be installed with a GUI. Use
+the GUI install instead of the `pip` install below.
+```
+
+### Mac OS X
+
+```{attention}
+
+If you are using the Anaconda GUI, it is recommended that you install JupyterLab through the GUI
+and skip the step below. Installing both through the GUI and `pip` may result in conflicts and
+a broken JupyterLab.
+
+If you have installed JupyterLab in your environment via the GUI, the above call `pip install ISLP` may be made within
+any running notebook within that environment.
+```
+
+If JupyterLab is not already installed, run the following after having activated your `islp` environment:
+
+```{code-cell} ipython3
+---
+tags: [skip-execution]
+---
+pip install jupyterlab
+```
+
+### Windows
+
+Either use the same `pip` command above or install JupyterLab from the
+`Home` tab. Ensure that the environment is your `islp`
+environment. This information appears near the top left in the
+Anaconda `Home` page.
+
+# Google Colab
+
+The notebooks for the labs can be run in [Google
+Colab](https://colab.research.google.com) with a few caveats:
+
+- Labs that use files in the filesystem will require one to mount your
+ Google Drive. See Google's [help](https://colab.research.google.com/notebooks/io.ipynb).
+
+- The packages will have to be reinstalled each time a new runtime is started.
+For most labs, inserting `pip install ISLP` at the top of the notebook will suffice, though Colab will ask you to restart after installation.
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
deleted file mode 100644
index 981b1ae..0000000
--- a/docs/source/installation.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Usage
-=====
-
-.. _installation:
-
-Installation
-------------
-
-To use ISLP, first install it using pip:
-
-.. code-block:: console
-
- (.venv) $ pip install ISLP
-
-Creating recipes
-----------------
-
-BLAH
diff --git a/docs/source/labs.myst b/docs/source/labs.myst
new file mode 100644
index 0000000..b33bd3d
--- /dev/null
+++ b/docs/source/labs.myst
@@ -0,0 +1,58 @@
+---
+file_format: mystnb
+kernelspec:
+ name: python3
+ display_name: python3
+myst_number_code_blocks: python
+---
+
+# Labs
+
+{{ ISLP_binder_code }}
+
+The current version of the labs for `ISLP` are included here.
+
+
+## Package versions
+
+
+```{attention}
+
+Python packages change frequently. The labs here are built
+with {{ ISLP_lab_link }}. Visit the lab git repo for specific instructions
+to install the frozen environment.
+
+
+A zip file containing all the labs and data files can be downloaded
+here {{ ISLP_zip_link }}.
+
+```
+
+```{warning}
+The version of the `ISLP` library used to build these labs
+may differ slightly from the one documented here.
+The labs are built with {{ ISLP_lab_version }}.
+
+The [Binder](http://mybinder.org) link above will run {{ ISLP_lab_link }} with
+library version {{ ISLP_lab_version }}.
+
+```
+
+
+```{toctree}
+maxdepth: 1
+
+labs/Ch02-statlearn-lab
+labs/Ch03-linreg-lab
+labs/Ch04-classification-lab
+labs/Ch05-resample-lab
+labs/Ch06-varselect-lab
+labs/Ch07-nonlin-lab
+labs/Ch08-baggboost-lab
+labs/Ch09-svm-lab
+labs/Ch10-deeplearning-lab
+labs/Ch11-surv-lab
+labs/Ch12-unsup-lab
+labs/Ch13-multiple-lab
+```
+
diff --git a/docs/source/logo.png b/docs/source/logo.png
new file mode 100644
index 0000000..237c1cd
Binary files /dev/null and b/docs/source/logo.png differ
diff --git a/docs/source/models.rst b/docs/source/models.rst
index b34581f..5f9e5c8 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -4,8 +4,8 @@ Tools for regression models
.. toctree::
models/spec
- models/derived
- models/submodels
models/selection
+ models/anova
+
diff --git a/docs/source/models/anova.ipynb b/docs/source/models/anova.ipynb
new file mode 100644
index 0000000..41e8bcb
--- /dev/null
+++ b/docs/source/models/anova.ipynb
@@ -0,0 +1,648 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ee33d364",
+ "metadata": {},
+ "source": [
+ "# ANOVA using `ModelSpec`\n",
+ "\n",
+ "\n",
+ "In this lab we illustrate how to create specific ANOVA analyses\n",
+ "using `ModelSpec`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "4c70fbaa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from statsmodels.api import OLS\n",
+ "from statsmodels.stats.anova import anova_lm\n",
+ "\n",
+ "from ISLP import load_data\n",
+ "from ISLP.models import (ModelSpec,\n",
+ " derived_feature,\n",
+ " summarize)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "333a49cf",
+ "metadata": {},
+ "source": [
+ "We will carry out two simple ANOVA analyses of the `Hitters` data.\n",
+ "We wish to predict a baseball player’s `Salary` on the\n",
+ "basis of various statistics associated with performance in the\n",
+ "previous year."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8a708215",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "59"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Hitters = load_data('Hitters')\n",
+ "np.isnan(Hitters['Salary']).sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dad5e991",
+ "metadata": {},
+ "source": [
+ " \n",
+ " We see that `Salary` is missing for 59 players. The\n",
+ "`dropna()` method of data frames removes all of the rows that have missing\n",
+ "values in any variable (by default --- see `Hitters.dropna?`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "ac7086a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n",
+ " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n",
+ " 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Hitters = Hitters.dropna()\n",
+ "Hitters.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a0a3521-be74-40df-a404-3895d80a11dc",
+ "metadata": {},
+ "source": [
+ "## Grouping variables\n",
+ "\n",
+ "A look at the [description](https://islp.readthedocs.io/en/latest/datasets/Hitters.html) of the data shows\n",
+ "that there are both career and 1986 offensive stats, as well as some defensive stats.\n",
+ "\n",
+ "Let's group the offensive into recent and career offensive stats, as well as a group of defensive variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "a215e43b-7bc8-4bdd-91cf-40d717cd7978",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "confounders = derived_feature(['Division', 'League', 'NewLeague'],\n",
+ " name='confounders')\n",
+ "offense_career = derived_feature(['CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks'],\n",
+ " name='offense_career')\n",
+ "offense_1986 = derived_feature(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks'],\n",
+ " name='offense_1986')\n",
+ "defense_1986 = derived_feature(['PutOuts', 'Assists', 'Errors'],\n",
+ " name='defense_1986')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aa15fd0c-1e8a-431e-8425-c61da8439976",
+ "metadata": {},
+ "source": [
+ "We'll first carry out a sequential ANOVA, in which the groups of terms are added to the model one at a time."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "40cd6c28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "design = ModelSpec([confounders, offense_career, defense_1986, offense_1986]).fit(Hitters)\n",
+ "Y = np.array(Hitters['Salary'])\n",
+ "X = design.transform(Hitters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "074120b1",
+ "metadata": {},
+ "source": [
+ "Having fit the design on the `Hitters` data, we extract the response and\n",
+ "fit an OLS model using all four groups of variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e65f5607",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " coef | \n",
+ " std err | \n",
+ " t | \n",
+ " P>|t| | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | intercept | \n",
+ " 148.2187 | \n",
+ " 73.595 | \n",
+ " 2.014 | \n",
+ " 0.045 | \n",
+ "
\n",
+ " \n",
+ " | Division[W] | \n",
+ " -116.0404 | \n",
+ " 40.188 | \n",
+ " -2.887 | \n",
+ " 0.004 | \n",
+ "
\n",
+ " \n",
+ " | League[N] | \n",
+ " 63.7503 | \n",
+ " 79.006 | \n",
+ " 0.807 | \n",
+ " 0.421 | \n",
+ "
\n",
+ " \n",
+ " | NewLeague[N] | \n",
+ " -24.3989 | \n",
+ " 78.843 | \n",
+ " -0.309 | \n",
+ " 0.757 | \n",
+ "
\n",
+ " \n",
+ " | CAtBat | \n",
+ " -0.1887 | \n",
+ " 0.120 | \n",
+ " -1.572 | \n",
+ " 0.117 | \n",
+ "
\n",
+ " \n",
+ " | CHits | \n",
+ " 0.1636 | \n",
+ " 0.665 | \n",
+ " 0.246 | \n",
+ " 0.806 | \n",
+ "
\n",
+ " \n",
+ " | CHmRun | \n",
+ " -0.1517 | \n",
+ " 1.612 | \n",
+ " -0.094 | \n",
+ " 0.925 | \n",
+ "
\n",
+ " \n",
+ " | CRuns | \n",
+ " 1.4716 | \n",
+ " 0.747 | \n",
+ " 1.971 | \n",
+ " 0.050 | \n",
+ "
\n",
+ " \n",
+ " | CRBI | \n",
+ " 0.8021 | \n",
+ " 0.691 | \n",
+ " 1.161 | \n",
+ " 0.247 | \n",
+ "
\n",
+ " \n",
+ " | CWalks | \n",
+ " -0.8124 | \n",
+ " 0.327 | \n",
+ " -2.481 | \n",
+ " 0.014 | \n",
+ "
\n",
+ " \n",
+ " | PutOuts | \n",
+ " 0.2827 | \n",
+ " 0.077 | \n",
+ " 3.661 | \n",
+ " 0.000 | \n",
+ "
\n",
+ " \n",
+ " | Assists | \n",
+ " 0.3755 | \n",
+ " 0.220 | \n",
+ " 1.705 | \n",
+ " 0.089 | \n",
+ "
\n",
+ " \n",
+ " | Errors | \n",
+ " -3.2940 | \n",
+ " 4.377 | \n",
+ " -0.753 | \n",
+ " 0.452 | \n",
+ "
\n",
+ " \n",
+ " | AtBat | \n",
+ " -1.9509 | \n",
+ " 0.624 | \n",
+ " -3.125 | \n",
+ " 0.002 | \n",
+ "
\n",
+ " \n",
+ " | Hits | \n",
+ " 7.4395 | \n",
+ " 2.363 | \n",
+ " 3.148 | \n",
+ " 0.002 | \n",
+ "
\n",
+ " \n",
+ " | HmRun | \n",
+ " 4.3449 | \n",
+ " 6.190 | \n",
+ " 0.702 | \n",
+ " 0.483 | \n",
+ "
\n",
+ " \n",
+ " | Runs | \n",
+ " -2.3312 | \n",
+ " 2.971 | \n",
+ " -0.785 | \n",
+ " 0.433 | \n",
+ "
\n",
+ " \n",
+ " | RBI | \n",
+ " -1.0670 | \n",
+ " 2.595 | \n",
+ " -0.411 | \n",
+ " 0.681 | \n",
+ "
\n",
+ " \n",
+ " | Walks | \n",
+ " 6.2196 | \n",
+ " 1.825 | \n",
+ " 3.409 | \n",
+ " 0.001 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " coef std err t P>|t|\n",
+ "intercept 148.2187 73.595 2.014 0.045\n",
+ "Division[W] -116.0404 40.188 -2.887 0.004\n",
+ "League[N] 63.7503 79.006 0.807 0.421\n",
+ "NewLeague[N] -24.3989 78.843 -0.309 0.757\n",
+ "CAtBat -0.1887 0.120 -1.572 0.117\n",
+ "CHits 0.1636 0.665 0.246 0.806\n",
+ "CHmRun -0.1517 1.612 -0.094 0.925\n",
+ "CRuns 1.4716 0.747 1.971 0.050\n",
+ "CRBI 0.8021 0.691 1.161 0.247\n",
+ "CWalks -0.8124 0.327 -2.481 0.014\n",
+ "PutOuts 0.2827 0.077 3.661 0.000\n",
+ "Assists 0.3755 0.220 1.705 0.089\n",
+ "Errors -3.2940 4.377 -0.753 0.452\n",
+ "AtBat -1.9509 0.624 -3.125 0.002\n",
+ "Hits 7.4395 2.363 3.148 0.002\n",
+ "HmRun 4.3449 6.190 0.702 0.483\n",
+ "Runs -2.3312 2.971 -0.785 0.433\n",
+ "RBI -1.0670 2.595 -0.411 0.681\n",
+ "Walks 6.2196 1.825 3.409 0.001"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "M = OLS(Y, X).fit()\n",
+ "summarize(M)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29d9b55f",
+ "metadata": {},
+ "source": [
+ "We'll first produce the sequential, or Type I ANOVA results. This builds up a model sequentially and compares\n",
+ "two successive models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "cfbe5b92",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " df_resid | \n",
+ " ssr | \n",
+ " df_diff | \n",
+ " ss_diff | \n",
+ " F | \n",
+ " Pr(>F) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | intercept | \n",
+ " 262.0 | \n",
+ " 5.331911e+07 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | confounders | \n",
+ " 259.0 | \n",
+ " 5.131263e+07 | \n",
+ " 3.0 | \n",
+ " 2.006478e+06 | \n",
+ " 6.741147 | \n",
+ " 2.144265e-04 | \n",
+ "
\n",
+ " \n",
+ " | offense_career | \n",
+ " 253.0 | \n",
+ " 3.059130e+07 | \n",
+ " 6.0 | \n",
+ " 2.072134e+07 | \n",
+ " 34.808656 | \n",
+ " 1.470455e-30 | \n",
+ "
\n",
+ " \n",
+ " | defense_1986 | \n",
+ " 250.0 | \n",
+ " 2.730614e+07 | \n",
+ " 3.0 | \n",
+ " 3.285156e+06 | \n",
+ " 11.037111 | \n",
+ " 7.880207e-07 | \n",
+ "
\n",
+ " \n",
+ " | offense_1986 | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 6.0 | \n",
+ " 3.097572e+06 | \n",
+ " 5.203444 | \n",
+ " 4.648586e-05 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " df_resid ssr df_diff ss_diff F \\\n",
+ "intercept 262.0 5.331911e+07 0.0 NaN NaN \n",
+ "confounders 259.0 5.131263e+07 3.0 2.006478e+06 6.741147 \n",
+ "offense_career 253.0 3.059130e+07 6.0 2.072134e+07 34.808656 \n",
+ "defense_1986 250.0 2.730614e+07 3.0 3.285156e+06 11.037111 \n",
+ "offense_1986 244.0 2.420857e+07 6.0 3.097572e+06 5.203444 \n",
+ "\n",
+ " Pr(>F) \n",
+ "intercept NaN \n",
+ "confounders 2.144265e-04 \n",
+ "offense_career 1.470455e-30 \n",
+ "defense_1986 7.880207e-07 \n",
+ "offense_1986 4.648586e-05 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = anova_lm(*[OLS(Y, D).fit() for D in design.build_sequence(Hitters, anova_type='sequential')])\n",
+ "df.index = design.names\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7092f666",
+ "metadata": {},
+ "source": [
+ "We can similarly compute the Type II ANOVA results which drops each term and compares to the full model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "e2d43844",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " df_resid | \n",
+ " ssr | \n",
+ " df_diff | \n",
+ " ss_diff | \n",
+ " F | \n",
+ " Pr(>F) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | intercept | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 1.0 | \n",
+ " 4.024254e+05 | \n",
+ " 4.056076 | \n",
+ " 4.511037e-02 | \n",
+ "
\n",
+ " \n",
+ " | confounders | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 3.0 | \n",
+ " 9.661738e+05 | \n",
+ " 3.246046 | \n",
+ " 2.261572e-02 | \n",
+ "
\n",
+ " \n",
+ " | offense_career | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 6.0 | \n",
+ " 1.051025e+07 | \n",
+ " 17.655596 | \n",
+ " 5.701196e-17 | \n",
+ "
\n",
+ " \n",
+ " | defense_1986 | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 3.0 | \n",
+ " 1.467933e+06 | \n",
+ " 4.931803 | \n",
+ " 2.415732e-03 | \n",
+ "
\n",
+ " \n",
+ " | offense_1986 | \n",
+ " 244.0 | \n",
+ " 2.420857e+07 | \n",
+ " 6.0 | \n",
+ " 3.097572e+06 | \n",
+ " 5.203444 | \n",
+ " 4.648586e-05 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " df_resid ssr df_diff ss_diff F \\\n",
+ "intercept 244.0 2.420857e+07 1.0 4.024254e+05 4.056076 \n",
+ "confounders 244.0 2.420857e+07 3.0 9.661738e+05 3.246046 \n",
+ "offense_career 244.0 2.420857e+07 6.0 1.051025e+07 17.655596 \n",
+ "defense_1986 244.0 2.420857e+07 3.0 1.467933e+06 4.931803 \n",
+ "offense_1986 244.0 2.420857e+07 6.0 3.097572e+06 5.203444 \n",
+ "\n",
+ " Pr(>F) \n",
+ "intercept 4.511037e-02 \n",
+ "confounders 2.261572e-02 \n",
+ "offense_career 5.701196e-17 \n",
+ "defense_1986 2.415732e-03 \n",
+ "offense_1986 4.648586e-05 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "D_full = design.transform(Hitters)\n",
+ "OLS_full = OLS(Y, D_full).fit()\n",
+ "dfs = []\n",
+ "for d in design.build_sequence(Hitters, anova_type='drop'):\n",
+ " dfs.append(anova_lm(OLS(Y,d).fit(), OLS_full).iloc[1:])\n",
+ "df = pd.concat(dfs)\n",
+ "df.index = design.names\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "362709ae-9558-4c4c-8f5e-f8388caf631d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
+ },
+ "kernelspec": {
+ "display_name": "python3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/source/models/derived.ipynb b/docs/source/models/derived.ipynb
deleted file mode 100644
index cc1b0ac..0000000
--- a/docs/source/models/derived.ipynb
+++ /dev/null
@@ -1,2125 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "38217f02",
- "metadata": {},
- "source": [
- "# Building design matrices with `ModelSpec`\n",
- "\n",
- "Force rebuild"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "3107d1f9",
- "metadata": {},
- "outputs": [],
- "source": [
- "x=4\n",
- "import numpy as np, pandas as pd\n",
- "%load_ext rpy2.ipython\n",
- "\n",
- "from ISLP import load_data\n",
- "from ISLP.models import ModelSpec\n",
- "\n",
- "import statsmodels.api as sm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "cdc46a4e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n",
- " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats = load_data('Carseats')\n",
- "%R -i Carseats\n",
- "Carseats.columns"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e0a2a83a",
- "metadata": {},
- "source": [
- "## Let's break up income into groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "68b40caf",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: OIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L' < 'M' < 'H']"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'])\n",
- "Carseats['OIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "35558d88",
- "metadata": {},
- "source": [
- "Let's also create an unordered version"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "e5e81a95",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: UIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L', 'M', 'H']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'],\n",
- " ordered=False)\n",
- "Carseats['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4bbf9e13",
- "metadata": {},
- "source": [
- "## A simple model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "1ad729b3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Price', 'Income'], dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Price', 'Income'])\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "d05e9ec8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 12.661546\n",
- "Price -0.052213\n",
- "Income 0.012829\n",
- "dtype: float64"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Y = Carseats['Sales']\n",
- "M = sm.OLS(Y, X).fit()\n",
- "M.params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b4e9ee33",
- "metadata": {},
- "source": [
- "## Basic procedure\n",
- "\n",
- "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n",
- "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n",
- "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "64ac65d3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 Bad\n",
- "1 Good\n",
- "2 Medium\n",
- "3 Medium\n",
- "4 Bad\n",
- " ... \n",
- "395 Good\n",
- "396 Medium\n",
- "397 Medium\n",
- "398 Bad\n",
- "399 Good\n",
- "Name: ShelveLoc, Length: 400, dtype: category\n",
- "Categories (3, object): ['Bad', 'Good', 'Medium']"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['ShelveLoc']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "620f0e01",
- "metadata": {},
- "source": [
- "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n",
- "`get_columns` and `fit_encoder`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "77b898e0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['ShelveLoc']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4580a6bf",
- "metadata": {},
- "source": [
- "It recognized ordinal columns as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "c2dab855",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['OIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "5e7963d6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([ 73, 48, 35, 100]), ('Income',))"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "income = design.column_info_['Income']\n",
- "cols, names = income.get_columns(Carseats)\n",
- "(cols[:4], names)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6b689966",
- "metadata": {},
- "source": [
- "## Encoding a column\n",
- "\n",
- "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n",
- "variables usually are encoded by several columns, typically one less than the number of categories.\n",
- "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n",
- "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n",
- "its encoder the first time data is passed to it."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "ff3b96b6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([[0., 0.],\n",
- " [1., 0.],\n",
- " [0., 1.],\n",
- " [0., 1.]]),\n",
- " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "shelve = design.column_info_['ShelveLoc']\n",
- "cols, names = shelve.get_columns(Carseats)\n",
- "(cols[:4], names)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "7e87da20",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[2.],\n",
- " [1.],\n",
- " [1.],\n",
- " [0.]])"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "oincome = design.column_info_['OIncome']\n",
- "oincome.get_columns(Carseats)[0][:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4f2030ac",
- "metadata": {},
- "source": [
- "## The terms\n",
- "\n",
- "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n",
- "the `terms` argument which should be a sequence. The elements of `terms` are often\n",
- "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n",
- "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "27fc4fb3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Price', 'Income']"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "16316981",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ef3f2bd0",
- "metadata": {},
- "source": [
- "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n",
- "`Variable` can also create columns through the `build_columns` method of `ModelSpec`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "dd9c7fa6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price\n",
- " 0 120\n",
- " 1 83\n",
- " 2 80\n",
- " 3 97\n",
- " 4 128\n",
- " .. ...\n",
- " 395 128\n",
- " 396 120\n",
- " 397 159\n",
- " 398 95\n",
- " 399 120\n",
- " \n",
- " [400 rows x 1 columns],\n",
- " ['Price'])"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "price = design.terms_[0]\n",
- "design.build_columns(Carseats, price)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5fc4cc45",
- "metadata": {},
- "source": [
- "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
- "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
- "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n",
- "the call to `ModelSpec.fit`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "49d7fb46",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price Income UIncome[L] UIncome[M]\n",
- " 0 120.0 73.0 0.0 1.0\n",
- " 1 83.0 48.0 1.0 0.0\n",
- " 2 80.0 35.0 1.0 0.0\n",
- " 3 97.0 100.0 0.0 0.0\n",
- " 4 128.0 64.0 0.0 1.0\n",
- " .. ... ... ... ...\n",
- " 395 128.0 108.0 0.0 0.0\n",
- " 396 120.0 23.0 1.0 0.0\n",
- " 397 159.0 26.0 1.0 0.0\n",
- " 398 95.0 79.0 0.0 1.0\n",
- " 399 120.0 37.0 1.0 0.0\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import Variable\n",
- "\n",
- "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n",
- "design.build_columns(Carseats, new_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bdfc0fe9",
- "metadata": {},
- "source": [
- "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n",
- "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "cf6f3f4c",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( mynewvar[0] mynewvar[1]\n",
- " 0 -3.608693 -4.853177\n",
- " 1 15.081506 35.708630\n",
- " 2 27.422871 40.774250\n",
- " 3 -33.973209 13.470489\n",
- " 4 6.567316 -11.290100\n",
- " .. ... ...\n",
- " 395 -36.846346 -18.415783\n",
- " 396 45.741500 3.245602\n",
- " 397 49.097533 -35.725355\n",
- " 398 -13.577772 18.845139\n",
- " 399 31.927566 0.978436\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.decomposition import PCA\n",
- "pca = PCA(n_components=2)\n",
- "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n",
- "design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1552d19a",
- "metadata": {},
- "source": [
- "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
- "or `Variable` instances (`pca_var`)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "12d955dd",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( Price Price mynewvar[0] mynewvar[1]\n",
- " 0 120.0 120.0 -3.608693 -4.853177\n",
- " 1 83.0 83.0 15.081506 35.708630\n",
- " 2 80.0 80.0 27.422871 40.774250\n",
- " 3 97.0 97.0 -33.973209 13.470489\n",
- " 4 128.0 128.0 6.567316 -11.290100\n",
- " .. ... ... ... ...\n",
- " 395 128.0 128.0 -36.846346 -18.415783\n",
- " 396 120.0 120.0 45.741500 3.245602\n",
- " 397 159.0 159.0 49.097533 -35.725355\n",
- " 398 95.0 95.0 -13.577772 18.845139\n",
- " 399 120.0 120.0 31.927566 0.978436\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n",
- "design.build_columns(Carseats, fancy_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f5ea292d",
- "metadata": {},
- "source": [
- "We can of course run PCA again on these features (if we wanted)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "ae2af29b",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( fancy_pca[0] fancy_pca[1]\n",
- " 0 -6.951792 4.859283\n",
- " 1 55.170148 -24.694875\n",
- " 2 59.418556 -38.033572\n",
- " 3 34.722389 28.922184\n",
- " 4 -21.419184 -3.120673\n",
- " .. ... ...\n",
- " 395 -18.257348 40.760122\n",
- " 396 -10.546709 -45.021658\n",
- " 397 -77.706359 -37.174379\n",
- " 398 36.668694 7.730851\n",
- " 399 -9.540535 -31.059122\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['fancy_pca[0]', 'fancy_pca[1]'])"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pca2 = PCA(n_components=2)\n",
- "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n",
- "design.build_columns(Carseats, pca2_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "57305dbe",
- "metadata": {},
- "source": [
- "## Building the design matrix\n",
- "\n",
- "With these notions in mind, the final design is essentially then"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "89656ec4",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f6cb8167",
- "metadata": {},
- "source": [
- "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n",
- "a dataframe the index is adjusted accordingly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "547cb625",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.intercept"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "ff5b41d5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " Price | \n",
- " Income | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 120 | \n",
- " 73 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 83 | \n",
- " 48 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 80 | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 97 | \n",
- " 100 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " intercept Price Income\n",
- "0 1.0 120 73\n",
- "1 1.0 83 48\n",
- "2 1.0 80 35\n",
- "3 1.0 97 100"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.transform(Carseats)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "932759cf",
- "metadata": {},
- "source": [
- "## Predicting\n",
- "\n",
- "Constructing the design matrix at any values is carried out by the `transform` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "e2190b00",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n",
- "new_X = design.transform(new_data)\n",
- "M.get_prediction(new_X).predicted_mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "6545c5da",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 0 1 \n",
- "12.65258 12.25873 \n"
- ]
- }
- ],
- "source": [
- "%%R -i new_data,Carseats\n",
- "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cd088b51",
- "metadata": {},
- "source": [
- "### Difference between using `pd.DataFrame` and `np.ndarray`\n",
- "\n",
- "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
- "\n",
- "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
- "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "8f37ae20",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[1.0, 120, 73],\n",
- " [1.0, 83, 48],\n",
- " [1.0, 80, 35],\n",
- " [1.0, 97, 100]], dtype=object)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n",
- "design_np = ModelSpec([0,3]).fit(Carseats_np)\n",
- "design_np.transform(Carseats_np)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "184aefc2",
- "metadata": {},
- "source": [
- "The following will fail for hopefully obvious reasons"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "e4134980",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_D = np.zeros((2,2))\n",
- " new_D[:,0] = [10,20]\n",
- " new_D[:,1] = [40,50]\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except ValueError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "53808f3b",
- "metadata": {},
- "source": [
- "Ultimately, `M` expects 3 columns for new predictions because it was fit\n",
- "with a matrix having 3 columns (the first representing an intercept).\n",
- "\n",
- "We might be tempted to try as with the `pd.DataFrame` and produce\n",
- "an `np.ndarray` with only the necessary variables."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "62059c57",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "index 3 is out of bounds for axis 1 with size 2\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_X = np.zeros((2,2))\n",
- " new_X[:,0] = [10,20]\n",
- " new_X[:,1] = [40,50]\n",
- " new_D = design_np.transform(new_X)\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except IndexError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ded12f69",
- "metadata": {},
- "source": [
- "This fails because `design_np` is looking for column `3` from its `terms`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "fbb509d1",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design_np.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f01391e4",
- "metadata": {},
- "source": [
- "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n",
- "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "10df55ae",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_X = np.zeros((2,4))\n",
- "new_X[:,0] = [10,20]\n",
- "new_X[:,3] = [40,50]\n",
- "new_D = design_np.transform(new_X)\n",
- "M.get_prediction(new_D).predicted_mean"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b43099fb",
- "metadata": {},
- "source": [
- "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n",
- "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "50bce64d",
- "metadata": {},
- "source": [
- "## A model with some categorical variables\n",
- "\n",
- "Categorical variables become `Column` instances with encoders."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "2eb2ff16",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n",
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "6686dff8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "id": "0e0eafd7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 11.876012\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[L] -1.042297\n",
- "UIncome[M] -0.119123\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "43cce209",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "99bf408e",
- "metadata": {},
- "source": [
- "## Getting the encoding you want\n",
- "\n",
- "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n",
- "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n",
- "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n",
- "where `new_encoder` would somehow drop the column we want dropped. \n",
- "\n",
- "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "11c19ebf",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4b48e5d2",
- "metadata": {},
- "source": [
- "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n",
- "we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n",
- "and we need a way to allow different encodings of the same column of `Carseats`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "81f641ba",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "79f7eb4d",
- "metadata": {},
- "source": [
- " We can create a new \n",
- "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "2afb3b5d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import contrast\n",
- "pref_encoding = contrast('UIncome', 'drop', 'L')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "c44692ab",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[M] UIncome[H]\n",
- " 0 1.0 0.0\n",
- " 1 0.0 0.0\n",
- " 2 0.0 0.0\n",
- " 3 0.0 1.0\n",
- " 4 1.0 0.0\n",
- " .. ... ...\n",
- " 395 0.0 1.0\n",
- " 396 0.0 0.0\n",
- " 397 0.0 0.0\n",
- " 398 1.0 0.0\n",
- " 399 0.0 0.0\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['UIncome[M]', 'UIncome[H]'])"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, pref_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "c0bfb2a5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "d263056c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 10.833715\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[M] 0.923174\n",
- "UIncome[H] 1.042297\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "edf0dc68",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "82071a54",
- "metadata": {},
- "source": [
- "## Interactions\n",
- "\n",
- "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n",
- "to `ModelSpec`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "cd18a4a4",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 7.866634\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "UIncome[L] -2.734895\n",
- "UIncome[M] -2.619745\n",
- "dtype: float64"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "229fa32d",
- "metadata": {},
- "source": [
- "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n",
- "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "b8c52dbb",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n",
- " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n",
- " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e7f93464",
- "metadata": {},
- "source": [
- "Comparing this to the previous `R` model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "4094c01f",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d448c9ca",
- "metadata": {},
- "source": [
- "We note a few important things:\n",
- "\n",
- "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n",
- "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n",
- "\n",
- "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n",
- "\n",
- "A few notes:\n",
- "\n",
- "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n",
- "\n",
- "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "634e05c6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[H] UIncome[L] UIncome[M]\n",
- " 0 0.0 0.0 1.0\n",
- " 1 0.0 1.0 0.0\n",
- " 2 0.0 1.0 0.0\n",
- " 3 1.0 0.0 0.0\n",
- " 4 0.0 0.0 1.0\n",
- " .. ... ... ...\n",
- " 395 1.0 0.0 0.0\n",
- " 396 0.0 1.0 0.0\n",
- " 397 0.0 1.0 0.0\n",
- " 398 0.0 0.0 1.0\n",
- " 399 0.0 1.0 0.0\n",
- " \n",
- " [400 rows x 3 columns],\n",
- " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "full_encoding = contrast('UIncome', None)\n",
- "design.build_columns(Carseats, full_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "id": "4c09c93f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.131739\n",
- "UIncome[M] 0.115150\n",
- "UIncome[H] 1.156118\n",
- "UIncome[H]:ShelveLoc[Good] 3.738052\n",
- "UIncome[H]:ShelveLoc[Medium] 1.514104\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "dtype: float64"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "48c1989f",
- "metadata": {},
- "source": [
- "## Special encodings\n",
- "\n",
- "For flexible models, we may want to consider transformations of features, i.e. polynomial\n",
- "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n",
- "we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`\n",
- "package includes a `Poly` transform"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "id": "85a28d87",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('Income',), name='poly(Income, 3)', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import poly\n",
- "poly('Income', 3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "e17c8a9d",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.440077\n",
- "poly(Income, 3)[0] 10.036373\n",
- "poly(Income, 3)[1] -2.799156\n",
- "poly(Income, 3)[2] 2.399601\n",
- "ShelveLoc[Good] 4.808133\n",
- "ShelveLoc[Medium] 1.889533\n",
- "dtype: float64"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "944f56d6",
- "metadata": {},
- "source": [
- "Compare:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "1889caca",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n",
- " 5.440077 10.036373 -2.799156 2.399601 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.808133 1.889533 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bd4dca31",
- "metadata": {},
- "source": [
- "## Splines\n",
- "\n",
- "Support for natural and B-splines is also included"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "70fae990",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import ns, bs, pca"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2d812694",
- "metadata": {},
- "source": [
- "## Custom encoding\n",
- "\n",
- "Instead of PCA we might run some clustering on some features and then uses the clusters to\n",
- "create new features. This can be done with `derived_variable`. Indeed, `pca`, `ns` and `bs` are all examples\n",
- "of this."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "8e5d2305",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import derived_variable, Contrast"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "id": "8a40c663",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([1, 1, 2, 1, 2, 1, 0, 1, 0, 0, 0, 1, 2, 2, 0, 1, 2, 1, 0, 0, 0, 2,\n",
- " 2, 2, 1, 2, 1, 0, 0, 1, 0, 1, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2,\n",
- " 0, 2, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 2, 2, 0, 1, 2,\n",
- " 0, 1, 1, 2, 1, 1, 2, 0, 0, 1, 1, 0, 2, 0, 1, 0, 0, 2, 2, 0, 1, 2,\n",
- " 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 2, 0, 0, 1, 2, 0, 1, 0,\n",
- " 0, 1, 0, 2, 0, 2, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0,\n",
- " 0, 0, 2, 1, 0, 2, 1, 1, 1, 2, 0, 0, 2, 0, 2, 1, 0, 0, 0, 1, 2, 2,\n",
- " 1, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 1, 0, 0, 1, 1, 1, 0, 0, 2, 0,\n",
- " 1, 0, 0, 2, 1, 0, 2, 1, 2, 1, 0, 2, 2, 1, 1, 2, 2, 0, 1, 1, 2, 2,\n",
- " 1, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 2, 2, 1, 1, 0, 0, 1, 2, 2, 1, 1,\n",
- " 1, 2, 0, 2, 2, 2, 2, 0, 1, 0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 0, 0, 0,\n",
- " 2, 2, 2, 2, 1, 0, 0, 0, 1, 0, 0, 2, 1, 0, 2, 1, 2, 1, 1, 2, 1, 2,\n",
- " 2, 2, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 2,\n",
- " 1, 2, 2, 1, 1, 0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 0,\n",
- " 1, 0, 1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 0, 2, 1, 0, 1,\n",
- " 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 1,\n",
- " 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2, 1, 0, 2, 1, 1, 0, 2, 2,\n",
- " 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1, 1,\n",
- " 2, 2, 0, 2], dtype=int32)"
- ]
- },
- "execution_count": 51,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.cluster import KMeans\n",
- "from sklearn.pipeline import make_pipeline\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "cluster = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n",
- "group = Variable(('Income', 'Price', 'Advertising', 'Population'), 'group', None)\n",
- "X = design.build_submodel(Carseats, [group]).drop('intercept', axis=1)\n",
- "cluster.fit(X.values)\n",
- "cluster.predict(X.values)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9bc38836",
- "metadata": {},
- "source": [
- "For clustering, we often want to use the `predict` method rather than the `transform` method. If the ultimate\n",
- "features all use `transform` then the do not even need to use these two calls to `make_pipeline`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "8ceab9b6",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " myclus | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 395 | \n",
- " 1.0 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 396 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 397 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 398 | \n",
- " 1.0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 399 | \n",
- " 1.0 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
400 rows × 2 columns
\n",
- "
"
- ],
- "text/plain": [
- " intercept myclus\n",
- "0 1.0 1\n",
- "1 1.0 1\n",
- "2 1.0 2\n",
- "3 1.0 1\n",
- "4 1.0 2\n",
- ".. ... ...\n",
- "395 1.0 1\n",
- "396 1.0 2\n",
- "397 1.0 2\n",
- "398 1.0 0\n",
- "399 1.0 2\n",
- "\n",
- "[400 rows x 2 columns]"
- ]
- },
- "execution_count": 52,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n",
- "cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], \n",
- " name='myclus', \n",
- " encoder=cluster2,\n",
- " use_transform=False)\n",
- "design = ModelSpec([cluster_var]).fit(Carseats)\n",
- "design.transform(Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1f9b2630",
- "metadata": {},
- "source": [
- "Somewhat clunkily, we can make this a categorical variable by creating a `Variable` with a\n",
- "categorical encoder."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "ffde00a5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=(Variable(variables=('Income', 'Price', 'Advertising', 'Population'), name='myclus', encoder=Pipeline(steps=[('standardscaler', StandardScaler()),\n",
- " ('kmeans', KMeans(n_clusters=3, random_state=0))]), use_transform=False, pure_columns=False, override_encoder_colnames=True),), name='mynewcat', encoder=Contrast(), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cluster2 = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=0))\n",
- "cluster_var = derived_variable(['Income', 'Price', 'Advertising', 'Population'], \n",
- " name='myclus', \n",
- " encoder=cluster2,\n",
- " use_transform=False)\n",
- "cat_cluster = Variable((cluster_var,), name='mynewcat', encoder=Contrast(method='drop'))\n",
- "cat_cluster"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "5afeab7c",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 395 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 396 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 397 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 398 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | 399 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
400 rows × 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " intercept 1 2\n",
- "0 1.0 1.0 0.0\n",
- "1 1.0 1.0 0.0\n",
- "2 1.0 0.0 1.0\n",
- "3 1.0 1.0 0.0\n",
- "4 1.0 0.0 1.0\n",
- ".. ... ... ...\n",
- "395 1.0 1.0 0.0\n",
- "396 1.0 0.0 1.0\n",
- "397 1.0 0.0 1.0\n",
- "398 1.0 0.0 0.0\n",
- "399 1.0 0.0 1.0\n",
- "\n",
- "[400 rows x 3 columns]"
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([cat_cluster]).fit(Carseats)\n",
- "\n",
- "design.transform(Carseats)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e24d5637-80fb-49bf-ac10-8ff68cb8bd8f",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "jupytext": {
- "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
- },
- "kernelspec": {
- "display_name": "python3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/source/models/selection.ipynb b/docs/source/models/selection.ipynb
index 3a7d002..fd66d95 100644
--- a/docs/source/models/selection.ipynb
+++ b/docs/source/models/selection.ipynb
@@ -2,2723 +2,259 @@
"cells": [
{
"cell_type": "markdown",
- "id": "72bae06a",
+ "id": "247387ec-1477-42e6-9e69-cad1cacb5721",
"metadata": {},
"source": [
- "# Model selection using `ModelSpec`"
+ "# Model selection using `ModelSpec`\n",
+ "\n",
+ "\n",
+ "In this lab we illustrate how to run forward stepwise model selection\n",
+ "using the model specification capability of `ModelSpec`."
]
},
{
"cell_type": "code",
"execution_count": 1,
- "id": "ae6bd850",
+ "id": "4720bb2a-6bec-4e91-a57e-9689aa4f0532",
"metadata": {},
"outputs": [],
"source": [
- "import numpy as np, pandas as pd\n",
- "%load_ext rpy2.ipython\n",
- "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from statsmodels.api import OLS\n",
"from ISLP import load_data\n",
- "from ISLP.models import ModelSpec\n",
- "\n",
- "import statsmodels.api as sm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "5ac10e72",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n",
- " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats = load_data('Carseats')\n",
- "%R -i Carseats\n",
- "Carseats.columns"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "80a586d9",
- "metadata": {},
- "source": [
- "## Let's break up income into groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "850356ba",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: OIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L' < 'M' < 'H']"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'])\n",
- "Carseats['OIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e24def3a",
- "metadata": {},
- "source": [
- "Let's also create an unordered version"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "edf83080",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: UIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L', 'M', 'H']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'],\n",
- " ordered=False)\n",
- "Carseats['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "aa22bb9c",
- "metadata": {},
- "source": [
- "## A simple model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "38d92522",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Price', 'Income'], dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Price', 'Income'])\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "cfc2056f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 12.661546\n",
- "Price -0.052213\n",
- "Income 0.012829\n",
- "dtype: float64"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Y = Carseats['Sales']\n",
- "M = sm.OLS(Y, X).fit()\n",
- "M.params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4674c345",
- "metadata": {},
- "source": [
- "## Basic procedure\n",
- "\n",
- "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n",
- "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n",
- "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "5688f0ad",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 Bad\n",
- "1 Good\n",
- "2 Medium\n",
- "3 Medium\n",
- "4 Bad\n",
- " ... \n",
- "395 Good\n",
- "396 Medium\n",
- "397 Medium\n",
- "398 Bad\n",
- "399 Good\n",
- "Name: ShelveLoc, Length: 400, dtype: category\n",
- "Categories (3, object): ['Bad', 'Good', 'Medium']"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['ShelveLoc']"
+ "from ISLP.models import (ModelSpec,\n",
+ " Stepwise,\n",
+ " sklearn_selected)"
]
},
{
"cell_type": "markdown",
- "id": "4ae28ffa",
+ "id": "1c224240-ce8b-47f3-a85a-052c43038b26",
"metadata": {},
"source": [
- "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n",
- "`get_columns` and `fit_encoder`."
+ "### Forward Selection\n",
+ " \n",
+ "We will apply the forward-selection approach to the `Hitters` \n",
+ "data. We wish to predict a baseball player’s `Salary` on the\n",
+ "basis of various statistics associated with performance in the\n",
+ "previous year."
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "5f8926fd",
+ "execution_count": 2,
+ "id": "2adc66cc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())"
+ "59"
]
},
- "execution_count": 8,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "design.column_info_['ShelveLoc']"
+ "Hitters = load_data('Hitters')\n",
+ "np.isnan(Hitters['Salary']).sum()"
]
},
{
"cell_type": "markdown",
- "id": "966f53a5",
- "metadata": {},
- "source": [
- "It recognized ordinal columns as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "a137fa1e",
+ "id": "40c9a484",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "design.column_info_['OIncome']"
+ " \n",
+ " We see that `Salary` is missing for 59 players. The\n",
+ "`dropna()` method of data frames removes all of the rows that have missing\n",
+ "values in any variable (by default --- see `Hitters.dropna?`)."
]
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "3390dcb0",
+ "execution_count": 3,
+ "id": "1869fdab",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(array([ 73, 48, 35, 100]), ('Income',))"
+ "(263, 20)"
]
},
- "execution_count": 10,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "income = design.column_info_['Income']\n",
- "cols, names = income.get_columns(Carseats)\n",
- "(cols[:4], names)"
+ "Hitters = Hitters.dropna()\n",
+ "Hitters.shape"
]
},
{
"cell_type": "markdown",
- "id": "b6667415",
- "metadata": {},
- "source": [
- "## Encoding a column\n",
- "\n",
- "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n",
- "variables usually are encoded by several columns, typically one less than the number of categories.\n",
- "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n",
- "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n",
- "its encoder the first time data is passed to it."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "a1b42dbd",
+ "id": "0a1fe9e6",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([[0., 0.],\n",
- " [1., 0.],\n",
- " [0., 1.],\n",
- " [0., 1.]]),\n",
- " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "shelve = design.column_info_['ShelveLoc']\n",
- "cols, names = shelve.get_columns(Carseats)\n",
- "(cols[:4], names)"
+ "We first choose the best model using forward selection based on AIC. This score\n",
+ "is not built in as a metric to `sklearn`. We therefore define a function to compute it ourselves, and use\n",
+ "it as a scorer. By default, `sklearn` tries to maximize a score, hence\n",
+ " our scoring function computes the negative AIC statistic."
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "31367988",
+ "execution_count": 4,
+ "id": "76bd8110",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[2.],\n",
- " [1.],\n",
- " [1.],\n",
- " [0.]])"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "oincome = design.column_info_['OIncome']\n",
- "oincome.get_columns(Carseats)[0][:4]"
+ "def negAIC(estimator, X, Y):\n",
+ " \"Negative AIC\"\n",
+ " n, p = X.shape\n",
+ " Yhat = estimator.predict(X)\n",
+ " MSE = np.mean((Y - Yhat)**2)\n",
+ "    return -(n + n * np.log(MSE) + 2 * (p + 1))\n",
+ " "
]
},
{
"cell_type": "markdown",
- "id": "751c1487",
- "metadata": {},
- "source": [
- "## The terms\n",
- "\n",
- "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n",
- "the `terms` argument which should be a sequence. The elements of `terms` are often\n",
- "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n",
- "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "6e2b6155",
+ "id": "14ba6f49",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Price', 'Income']"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "design.terms"
+ "We now construct the full design matrix, using all the variables, along with the response `Salary`.\n",
+ "These are the inputs used by the search strategy and selection procedure below."
]
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "d3e669da",
+ "execution_count": 5,
+ "id": "94e10f35",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "design.terms_"
+ "design = ModelSpec(Hitters.columns.drop('Salary')).fit(Hitters)\n",
+ "Y = np.array(Hitters['Salary'])\n",
+ "X = design.transform(Hitters)"
]
},
{
"cell_type": "markdown",
- "id": "fb0a45c9",
+ "id": "afdda5f2",
"metadata": {},
"source": [
- "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n",
- "`Variable` can also create columns through the `build_columns` method of `ModelSpec`"
+ "Along with a score we need to specify the search strategy. This is done through the object\n",
+ "`Stepwise()` in the `ISLP.models` package. The method `Stepwise.first_peak()`\n",
+ "runs forward stepwise until any further additions to the model do not result\n",
+ "in an improvement in the evaluation score. Similarly, the method `Stepwise.fixed_steps()`\n",
+ "runs a fixed number of steps of stepwise search."
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "554c67cb",
+ "execution_count": 6,
+ "id": "048c8500",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price\n",
- " 0 120\n",
- " 1 83\n",
- " 2 80\n",
- " 3 97\n",
- " 4 128\n",
- " .. ...\n",
- " 395 128\n",
- " 396 120\n",
- " 397 159\n",
- " 398 95\n",
- " 399 120\n",
- " \n",
- " [400 rows x 1 columns],\n",
- " ['Price'])"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "price = design.terms_[0]\n",
- "design.build_columns(Carseats, price)"
+ "strategy = Stepwise.first_peak(design,\n",
+ " direction='forward',\n",
+ " max_terms=len(design.terms))"
]
},
{
"cell_type": "markdown",
- "id": "06956a6f",
+ "id": "e0c0af0e",
"metadata": {},
"source": [
- "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
- "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
- "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n",
- "the call to `ModelSpec.fit`."
+ " \n",
+ "We now fit a linear regression model with `Salary` as outcome using forward\n",
+ "selection. To do so, we use the function `sklearn_selected()` from the `ISLP.models` package. This takes\n",
+ "a model from `statsmodels` along with a search strategy and selects a model with its\n",
+ "`fit` method. Without specifying a `scoring` argument, the score defaults to MSE, and so all 19 variables will be\n",
+ "selected."
]
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "dd434884",
+ "execution_count": 7,
+ "id": "26f09fe9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "( Price Income UIncome[L] UIncome[M]\n",
- " 0 120.0 73.0 0.0 1.0\n",
- " 1 83.0 48.0 1.0 0.0\n",
- " 2 80.0 35.0 1.0 0.0\n",
- " 3 97.0 100.0 0.0 0.0\n",
- " 4 128.0 64.0 0.0 1.0\n",
- " .. ... ... ... ...\n",
- " 395 128.0 108.0 0.0 0.0\n",
- " 396 120.0 23.0 1.0 0.0\n",
- " 397 159.0 26.0 1.0 0.0\n",
- " 398 95.0 79.0 0.0 1.0\n",
- " 399 120.0 37.0 1.0 0.0\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])"
+ "('Assists',\n",
+ " 'AtBat',\n",
+ " 'CAtBat',\n",
+ " 'CHits',\n",
+ " 'CHmRun',\n",
+ " 'CRBI',\n",
+ " 'CRuns',\n",
+ " 'CWalks',\n",
+ " 'Division',\n",
+ " 'Errors',\n",
+ " 'Hits',\n",
+ " 'HmRun',\n",
+ " 'League',\n",
+ " 'NewLeague',\n",
+ " 'PutOuts',\n",
+ " 'RBI',\n",
+ " 'Runs',\n",
+ " 'Walks',\n",
+ " 'Years')"
]
},
- "execution_count": 16,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "from ISLP.models.model_spec import Variable\n",
- "\n",
- "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n",
- "design.build_columns(Carseats, new_var)"
+ "hitters_MSE = sklearn_selected(OLS,\n",
+ " strategy)\n",
+ "hitters_MSE.fit(Hitters, Y)\n",
+ "hitters_MSE.selected_state_"
]
},
{
"cell_type": "markdown",
- "id": "5cdb088c",
+ "id": "4acf4792",
"metadata": {},
"source": [
- "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n",
- "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`."
+ " Using `negAIC` results in a smaller model, as expected, with just 4 variables selected."
]
},
{
"cell_type": "code",
- "execution_count": 17,
- "id": "519a642e",
+ "execution_count": 8,
+ "id": "a825f4d8",
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
{
"data": {
"text/plain": [
- "( mynewvar[0] mynewvar[1]\n",
- " 0 -3.608693 -4.853177\n",
- " 1 15.081506 35.708630\n",
- " 2 27.422871 40.774250\n",
- " 3 -33.973209 13.470489\n",
- " 4 6.567316 -11.290100\n",
- " .. ... ...\n",
- " 395 -36.846346 -18.415783\n",
- " 396 45.741500 3.245602\n",
- " 397 49.097533 -35.725355\n",
- " 398 -13.577772 18.845139\n",
- " 399 31.927566 0.978436\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['mynewvar[0]', 'mynewvar[1]'])"
+ "('Assists', 'Errors', 'League', 'NewLeague')"
]
},
- "execution_count": 17,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "from sklearn.decomposition import PCA\n",
- "pca = PCA(n_components=2)\n",
- "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n",
- "design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "403921a2",
- "metadata": {},
- "source": [
- "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
- "or `Variable` instances (`pca_var`)."
+ "hitters_Cp = sklearn_selected(OLS,\n",
+ " strategy,\n",
+ " scoring=negAIC)\n",
+ "hitters_Cp.fit(Hitters, Y)\n",
+ "hitters_Cp.selected_state_"
]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "b422cde1",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( Price Price mynewvar[0] mynewvar[1]\n",
- " 0 120.0 120.0 -3.608693 -4.853177\n",
- " 1 83.0 83.0 15.081506 35.708630\n",
- " 2 80.0 80.0 27.422871 40.774250\n",
- " 3 97.0 97.0 -33.973209 13.470489\n",
- " 4 128.0 128.0 6.567316 -11.290100\n",
- " .. ... ... ... ...\n",
- " 395 128.0 128.0 -36.846346 -18.415783\n",
- " 396 120.0 120.0 45.741500 3.245602\n",
- " 397 159.0 159.0 49.097533 -35.725355\n",
- " 398 95.0 95.0 -13.577772 18.845139\n",
- " 399 120.0 120.0 31.927566 0.978436\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n",
- "design.build_columns(Carseats, fancy_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "53e38f57",
- "metadata": {},
- "source": [
- "We can of course run PCA again on these features (if we wanted)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "6347acb6",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( fancy_pca[0] fancy_pca[1]\n",
- " 0 -6.951792 4.859283\n",
- " 1 55.170148 -24.694875\n",
- " 2 59.418556 -38.033572\n",
- " 3 34.722389 28.922184\n",
- " 4 -21.419184 -3.120673\n",
- " .. ... ...\n",
- " 395 -18.257348 40.760122\n",
- " 396 -10.546709 -45.021658\n",
- " 397 -77.706359 -37.174379\n",
- " 398 36.668694 7.730851\n",
- " 399 -9.540535 -31.059122\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['fancy_pca[0]', 'fancy_pca[1]'])"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pca2 = PCA(n_components=2)\n",
- "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n",
- "design.build_columns(Carseats, pca2_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "08b5ddb0",
- "metadata": {},
- "source": [
- "## Building the design matrix\n",
- "\n",
- "With these notions in mind, the final design is essentially then"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "a8eb3e33",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "97912337",
- "metadata": {},
- "source": [
- "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n",
- "a dataframe the index is adjusted accordingly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "72b5e629",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.intercept"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "8a457e3e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " Price | \n",
- " Income | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 120 | \n",
- " 73 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 83 | \n",
- " 48 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 80 | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 97 | \n",
- " 100 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " intercept Price Income\n",
- "0 1.0 120 73\n",
- "1 1.0 83 48\n",
- "2 1.0 80 35\n",
- "3 1.0 97 100"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.transform(Carseats)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8624ab8c",
- "metadata": {},
- "source": [
- "## Predicting\n",
- "\n",
- "Constructing the design matrix at any values is carried out by the `transform` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "6052765e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n",
- "new_X = design.transform(new_data)\n",
- "M.get_prediction(new_X).predicted_mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "9158de59",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 0 1 \n",
- "12.65258 12.25873 \n"
- ]
- }
- ],
- "source": [
- "%%R -i new_data,Carseats\n",
- "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9608bed3",
- "metadata": {},
- "source": [
- "### Difference between using `pd.DataFrame` and `np.ndarray`\n",
- "\n",
- "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
- "\n",
- "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
- "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "f0b8120f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[1.0, 120, 73],\n",
- " [1.0, 83, 48],\n",
- " [1.0, 80, 35],\n",
- " [1.0, 97, 100]], dtype=object)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n",
- "design_np = ModelSpec([0,3]).fit(Carseats_np)\n",
- "design_np.transform(Carseats_np)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "270a02a6",
- "metadata": {},
- "source": [
- "The following will fail for hopefully obvious reasons"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "4ffbce7e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_D = np.zeros((2,2))\n",
- " new_D[:,0] = [10,20]\n",
- " new_D[:,1] = [40,50]\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except ValueError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bc5ff62b",
- "metadata": {},
- "source": [
- "Ultimately, `M` expects 3 columns for new predictions because it was fit\n",
- "with a matrix having 3 columns (the first representing an intercept).\n",
- "\n",
- "We might be tempted to try as with the `pd.DataFrame` and produce\n",
- "an `np.ndarray` with only the necessary variables."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "34dae1e9",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "index 3 is out of bounds for axis 1 with size 2\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_X = np.zeros((2,2))\n",
- " new_X[:,0] = [10,20]\n",
- " new_X[:,1] = [40,50]\n",
- " new_D = design_np.transform(new_X)\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except IndexError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7e9da262",
- "metadata": {},
- "source": [
- "This fails because `design_np` is looking for column `3` from its `terms`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "938b9430",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design_np.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "083e9529",
- "metadata": {},
- "source": [
- "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n",
- "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "d413a9fe",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_X = np.zeros((2,4))\n",
- "new_X[:,0] = [10,20]\n",
- "new_X[:,3] = [40,50]\n",
- "new_D = design_np.transform(new_X)\n",
- "M.get_prediction(new_D).predicted_mean"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0f4b508b",
- "metadata": {},
- "source": [
- "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n",
- "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8bcbd973",
- "metadata": {},
- "source": [
- "## A model with some categorical variables\n",
- "\n",
- "Categorical variables become `Column` instances with encoders."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "cf13f72e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n",
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "c1fa0a90",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "id": "b28aa313",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 11.876012\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[L] -1.042297\n",
- "UIncome[M] -0.119123\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "aa764acc",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "31876a29",
- "metadata": {},
- "source": [
- "## Getting the encoding you want\n",
- "\n",
- "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n",
- "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n",
- "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n",
- "where `new_encoder` would somehow drop the column we want dropped. \n",
- "\n",
- "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "bac2643c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1485735d",
- "metadata": {},
- "source": [
- "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n",
- "we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n",
- "and we need a way to allow different encodings of the same column of `Carseats`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "3987c5d6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7a6631c9",
- "metadata": {},
- "source": [
- " We can create a new \n",
- "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "83a9b94e",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import contrast\n",
- "pref_encoding = contrast('UIncome', 'drop', 'L')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "f0ffabea",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[M] UIncome[H]\n",
- " 0 1.0 0.0\n",
- " 1 0.0 0.0\n",
- " 2 0.0 0.0\n",
- " 3 0.0 1.0\n",
- " 4 1.0 0.0\n",
- " .. ... ...\n",
- " 395 0.0 1.0\n",
- " 396 0.0 0.0\n",
- " 397 0.0 0.0\n",
- " 398 1.0 0.0\n",
- " 399 0.0 0.0\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['UIncome[M]', 'UIncome[H]'])"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, pref_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "4a5fdc64",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "ae7e3bd2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 10.833715\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[M] 0.923174\n",
- "UIncome[H] 1.042297\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "c12ac3df",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "53bf8aef",
- "metadata": {},
- "source": [
- "## Interactions\n",
- "\n",
- "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n",
- "to `ModelSpec`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "47723bce",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 7.866634\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "UIncome[L] -2.734895\n",
- "UIncome[M] -2.619745\n",
- "dtype: float64"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "86060622",
- "metadata": {},
- "source": [
- "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n",
- "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "d7a2ab9b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n",
- " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n",
- " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2a5e7f6b",
- "metadata": {},
- "source": [
- "Comparing this to the previous `R` model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "bbb02036",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "89106a85",
- "metadata": {},
- "source": [
- "We note a few important things:\n",
- "\n",
- "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n",
- "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n",
- "\n",
- "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n",
- "\n",
- "A few notes:\n",
- "\n",
- "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n",
- "\n",
- "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "151f3fee",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[H] UIncome[L] UIncome[M]\n",
- " 0 0.0 0.0 1.0\n",
- " 1 0.0 1.0 0.0\n",
- " 2 0.0 1.0 0.0\n",
- " 3 1.0 0.0 0.0\n",
- " 4 0.0 0.0 1.0\n",
- " .. ... ... ...\n",
- " 395 1.0 0.0 0.0\n",
- " 396 0.0 1.0 0.0\n",
- " 397 0.0 1.0 0.0\n",
- " 398 0.0 0.0 1.0\n",
- " 399 0.0 1.0 0.0\n",
- " \n",
- " [400 rows x 3 columns],\n",
- " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "full_encoding = contrast('UIncome', None)\n",
- "design.build_columns(Carseats, full_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "id": "945ce7bc",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.131739\n",
- "UIncome[M] 0.115150\n",
- "UIncome[H] 1.156118\n",
- "UIncome[H]:ShelveLoc[Good] 3.738052\n",
- "UIncome[H]:ShelveLoc[Medium] 1.514104\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "dtype: float64"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "450b94dd",
- "metadata": {},
- "source": [
- "## Special encodings\n",
- "\n",
- "For flexible models, we may want to consider transformations of features, i.e. polynomial\n",
- "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n",
- "we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`\n",
- "package includes a `Poly` transform"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "id": "18d5c1c8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('Income',), name='poly(Income, 3, )', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import poly\n",
- "poly('Income', 3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "46c7d911",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.440077\n",
- "poly(Income, 3, )[0] 10.036373\n",
- "poly(Income, 3, )[1] -2.799156\n",
- "poly(Income, 3, )[2] 2.399601\n",
- "ShelveLoc[Good] 4.808133\n",
- "ShelveLoc[Medium] 1.889533\n",
- "dtype: float64"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "99bf13a1",
- "metadata": {},
- "source": [
- "Compare:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "7606facd",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n",
- " 5.440077 10.036373 -2.799156 2.399601 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.808133 1.889533 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a4931031",
- "metadata": {},
- "source": [
- "## Splines\n",
- "\n",
- "Support for natural and B-splines is also included"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "1c1bf5f3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 4.240421\n",
- "ns(Income, , df=5)[0] 1.468196\n",
- "ns(Income, , df=5)[1] 1.499471\n",
- "ns(Income, , df=5)[2] 1.152070\n",
- "ns(Income, , df=5)[3] 2.418398\n",
- "ns(Income, , df=5)[4] 1.804460\n",
- "ShelveLoc[Good] 4.810449\n",
- "ShelveLoc[Medium] 1.881095\n",
- "dtype: float64"
- ]
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import ns, bs, pca\n",
- "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "8c24254b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n",
- " 4.240421 1.468196 1.499471 1.152070 \n",
- "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n",
- " 2.418398 1.804460 4.810449 1.881095 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "library(splines)\n",
- "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "id": "f9d6c4a7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 3.495085\n",
- "bs(Income, , df=7, degree=2)[0] 1.813118\n",
- "bs(Income, , df=7, degree=2)[1] 0.961852\n",
- "bs(Income, , df=7, degree=2)[2] 2.471545\n",
- "bs(Income, , df=7, degree=2)[3] 2.158891\n",
- "bs(Income, , df=7, degree=2)[4] 2.091625\n",
- "bs(Income, , df=7, degree=2)[5] 2.600669\n",
- "bs(Income, , df=7, degree=2)[6] 2.843108\n",
- "ShelveLoc[Good] 4.804919\n",
- "ShelveLoc[Medium] 1.880337\n",
- "dtype: float64"
- ]
- },
- "execution_count": 51,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "0bf1726a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) bs(Income, df = 7, degree = 2)1 \n",
- " 3.4950851 1.8131176 \n",
- "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n",
- " 0.9618523 2.4715450 \n",
- "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n",
- " 2.1588908 2.0916252 \n",
- "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n",
- " 2.6006694 2.8431084 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.8049190 1.8803375 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "914df4cf",
- "metadata": {},
- "source": [
- "## PCA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "cc22e780",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.419405\n",
- "pca(myvars, , n_components=2)[0] -0.001131\n",
- "pca(myvars, , n_components=2)[1] -0.024217\n",
- "ShelveLoc[Good] 4.816253\n",
- "ShelveLoc[Medium] 1.924139\n",
- "dtype: float64"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars'), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "de571e61",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.419405 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n",
- " 0.001131 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n",
- " -0.024217 \n",
- " ShelveLocGood \n",
- " 4.816253 \n",
- " ShelveLocMedium \n",
- " 1.924139 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0a103b5a",
- "metadata": {},
- "source": [
- "It is of course common to scale before running PCA."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "id": "95ca42f5",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.352159\n",
- "pca(myvars, , n_components=2)[0] 0.446383\n",
- "pca(myvars, , n_components=2)[1] -1.219788\n",
- "ShelveLoc[Good] 4.922780\n",
- "ShelveLoc[Medium] 2.005617\n",
- "dtype: float64"
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars',\n",
- " scale=True), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "id": "0dc22e35",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.3522 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n",
- " 0.4469 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n",
- " -1.2213 \n",
- " ShelveLocGood \n",
- " 4.9228 \n",
- " ShelveLocMedium \n",
- " 2.0056 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "70347ee9",
- "metadata": {},
- "source": [
- "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n",
- "of `np.std(ddof=1)`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "id": "aa0c2f2e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 0.44694166, -1.22131519])"
- ]
- },
- "execution_count": 57,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ab05c497",
- "metadata": {},
- "source": [
- "## Model selection\n",
- "\n",
- "Another task requiring different design matrices is model selection. Manipulating\n",
- "the `terms` attribute of a `ModelSpec` (or more precisely its more uniform version `terms_`)\n",
- "can clearly allow for both exhaustive and stepwise model selection."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "id": "9505c178",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.strategy import (Stepwise, \n",
- " min_max)\n",
- "from ISLP.models.generic_selector import FeatureSelector"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "020c2532",
- "metadata": {},
- "source": [
- "### Best subsets"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "id": "f9aba6db",
- "metadata": {},
- "outputs": [],
- "source": [
- "design = ModelSpec(['Price', \n",
- " 'UIncome', \n",
- " 'Advertising', \n",
- " 'US', \n",
- " 'Income',\n",
- " 'ShelveLoc',\n",
- " 'Education',\n",
- " 'Urban']).fit(Carseats)\n",
- "strategy = min_max(design,\n",
- " min_terms=0,\n",
- " max_terms=3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "id": "91144a3d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.linear_model import LinearRegression\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "id": "ae3cb2eb",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 61,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.fit(Carseats, Y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "id": "e63b2744",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Price', 'Advertising', 'ShelveLoc')"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "id": "0a774b48",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([(), ('Price',), ('UIncome',), ('Advertising',), ('US',), ('Income',), ('ShelveLoc',), ('Education',), ('Urban',), ('Price', 'UIncome'), ('Price', 'Advertising'), ('Price', 'US'), ('Price', 'Income'), ('Price', 'ShelveLoc'), ('Price', 'Education'), ('Price', 'Urban'), ('UIncome', 'Advertising'), ('UIncome', 'US'), ('UIncome', 'Income'), ('UIncome', 'ShelveLoc'), ('UIncome', 'Education'), ('UIncome', 'Urban'), ('Advertising', 'US'), ('Advertising', 'Income'), ('Advertising', 'ShelveLoc'), ('Advertising', 'Education'), ('Advertising', 'Urban'), ('US', 'Income'), ('US', 'ShelveLoc'), ('US', 'Education'), ('US', 'Urban'), ('Income', 'ShelveLoc'), ('Income', 'Education'), ('Income', 'Urban'), ('ShelveLoc', 'Education'), ('ShelveLoc', 'Urban'), ('Education', 'Urban'), ('Price', 'UIncome', 'Advertising'), ('Price', 'UIncome', 'US'), ('Price', 'UIncome', 'Income'), ('Price', 'UIncome', 'ShelveLoc'), ('Price', 'UIncome', 'Education'), ('Price', 'UIncome', 'Urban'), ('Price', 'Advertising', 'US'), ('Price', 'Advertising', 'Income'), ('Price', 'Advertising', 'ShelveLoc'), ('Price', 'Advertising', 'Education'), ('Price', 'Advertising', 'Urban'), ('Price', 'US', 'Income'), ('Price', 'US', 'ShelveLoc'), ('Price', 'US', 'Education'), ('Price', 'US', 'Urban'), ('Price', 'Income', 'ShelveLoc'), ('Price', 'Income', 'Education'), ('Price', 'Income', 'Urban'), ('Price', 'ShelveLoc', 'Education'), ('Price', 'ShelveLoc', 'Urban'), ('Price', 'Education', 'Urban'), ('UIncome', 'Advertising', 'US'), ('UIncome', 'Advertising', 'Income'), ('UIncome', 'Advertising', 'ShelveLoc'), ('UIncome', 'Advertising', 'Education'), ('UIncome', 'Advertising', 'Urban'), ('UIncome', 'US', 'Income'), ('UIncome', 'US', 'ShelveLoc'), ('UIncome', 'US', 'Education'), ('UIncome', 'US', 'Urban'), ('UIncome', 'Income', 'ShelveLoc'), ('UIncome', 'Income', 'Education'), ('UIncome', 'Income', 'Urban'), ('UIncome', 'ShelveLoc', 'Education'), ('UIncome', 'ShelveLoc', 'Urban'), ('UIncome', 'Education', 
'Urban'), ('Advertising', 'US', 'Income'), ('Advertising', 'US', 'ShelveLoc'), ('Advertising', 'US', 'Education'), ('Advertising', 'US', 'Urban'), ('Advertising', 'Income', 'ShelveLoc'), ('Advertising', 'Income', 'Education'), ('Advertising', 'Income', 'Urban'), ('Advertising', 'ShelveLoc', 'Education'), ('Advertising', 'ShelveLoc', 'Urban'), ('Advertising', 'Education', 'Urban'), ('US', 'Income', 'ShelveLoc'), ('US', 'Income', 'Education'), ('US', 'Income', 'Urban'), ('US', 'ShelveLoc', 'Education'), ('US', 'ShelveLoc', 'Urban'), ('US', 'Education', 'Urban'), ('Income', 'ShelveLoc', 'Education'), ('Income', 'ShelveLoc', 'Urban'), ('Income', 'Education', 'Urban'), ('ShelveLoc', 'Education', 'Urban')])"
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "id": "0ca1f28c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Price', 'Advertising', 'Income')"
- ]
- },
- "execution_count": 64,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "strategy = min_max(design,\n",
- " min_terms=0,\n",
- " max_terms=3,\n",
- " lower_terms=['Price'],\n",
- " upper_terms=['Price', 'Income', 'Advertising'])\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error')\n",
- "selector.fit(Carseats, Y)\n",
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "id": "5c6732fa",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([('Price',), ('Price', 'Advertising'), ('Price', 'Income'), ('Price', 'Advertising', 'Income')])"
- ]
- },
- "execution_count": 65,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7bb6fcc3",
- "metadata": {},
- "source": [
- "### Stepwise selection"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "id": "9985d0fc",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Advertising', 'Income', 'Price', 'ShelveLoc')"
- ]
- },
- "execution_count": 66,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "strategy = Stepwise.first_peak(design,\n",
- " min_terms=0,\n",
- " max_terms=6,\n",
- " lower_terms=['Price'],\n",
- " upper_terms=['Price', 'Income', 'Advertising', 'ShelveLoc', 'UIncome', 'US'\n",
- " 'Education', 'Urban'])\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error',\n",
- " cv=3)\n",
- "selector.fit(Carseats, Y)\n",
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "id": "d3cf3e9b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([(), ('Price',), ('Price', 'UIncome'), ('Advertising', 'Price'), ('Income', 'Price'), ('Price', 'ShelveLoc'), ('Price', 'Urban'), ('Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Price', 'ShelveLoc'), ('Income', 'Price', 'ShelveLoc'), ('Price', 'ShelveLoc', 'Urban'), ('Advertising', 'Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Income', 'Price', 'ShelveLoc'), ('Advertising', 'Price', 'ShelveLoc', 'Urban'), ('Advertising', 'Income', 'Price', 'ShelveLoc', 'UIncome'), ('Advertising', 'Income', 'Price', 'ShelveLoc', 'Urban')])"
- ]
- },
- "execution_count": 67,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "id": "dd43ea7c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{(): -8.055847677297269,\n",
- " ('Price',): -6.514630258019962,\n",
- " ('Price', 'UIncome'): -6.621654905418576,\n",
- " ('Advertising', 'Price'): -5.825225309857156,\n",
- " ('Income', 'Price'): -6.455432795910743,\n",
- " ('Price', 'ShelveLoc'): -3.780183168075897,\n",
- " ('Price', 'Urban'): -6.5430157266926114,\n",
- " ('Price', 'ShelveLoc', 'UIncome'): -3.6938729706475004,\n",
- " ('Advertising', 'Price', 'ShelveLoc'): -3.2067316025050645,\n",
- " ('Income', 'Price', 'ShelveLoc'): -3.634698914456587,\n",
- " ('Price', 'ShelveLoc', 'Urban'): -3.776148947585277,\n",
- " ('Advertising', 'Price', 'ShelveLoc', 'UIncome'): -3.1240961493998642,\n",
- " ('Advertising', 'Income', 'Price', 'ShelveLoc'): -3.0801704971796244,\n",
- " ('Advertising', 'Price', 'ShelveLoc', 'Urban'): -3.207569489139369,\n",
- " ('Advertising',\n",
- " 'Income',\n",
- " 'Price',\n",
- " 'ShelveLoc',\n",
- " 'UIncome'): -3.1048826894036115,\n",
- " ('Advertising', 'Income', 'Price', 'ShelveLoc', 'Urban'): -3.0867130108677423}"
- ]
- },
- "execution_count": 68,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.results_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "id": "7c026f0a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Advertising', 'Income', 'Price', 'ShelveLoc')"
- ]
- },
- "execution_count": 69,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b4b89d04",
- "metadata": {},
- "source": [
- "### Enforcing constraints\n",
- "\n",
- "In models with interactions, we may often want to impose constraints on interactions and main effects.\n",
- "This can be achieved here by use of a `validator` that checks whether a given model is valid.\n",
- "\n",
- "Suppose we want to have the following constraint: `ShelveLoc` may not be in the model unless\n",
- "`Price` is in the following model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "id": "1c1e31d0",
- "metadata": {},
- "outputs": [],
- "source": [
- "design = ModelSpec(['Price', \n",
- " 'Advertising', \n",
- " 'Income',\n",
- " 'ShelveLoc']).fit(Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "be929807",
- "metadata": {},
- "source": [
- "The constraints are described with a boolean matrix with `(i,j)` as `j` is a child of `i`: so `j` should not\n",
- "be in the model when `i` is not and enforced with a callable `validator` that evaluates each candidate state.\n",
- "\n",
- "Both `min_max_strategy` and `step_strategy` accept a `validator` argument."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "id": "c075b1b7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([(), ('Price',), ('Advertising',), ('Income',), ('Price', 'Advertising'), ('Price', 'Income'), ('Price', 'ShelveLoc'), ('Advertising', 'Income'), ('Price', 'Advertising', 'Income'), ('Price', 'Advertising', 'ShelveLoc'), ('Price', 'Income', 'ShelveLoc'), ('Price', 'Advertising', 'Income', 'ShelveLoc')])"
- ]
- },
- "execution_count": 71,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.strategy import validator_from_constraints\n",
- "constraints = np.zeros((4, 4))\n",
- "constraints[0,3] = 1\n",
- "strategy = min_max(design,\n",
- " min_terms=0,\n",
- " max_terms=4,\n",
- " validator=validator_from_constraints(design,\n",
- " constraints))\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error',\n",
- " cv=3)\n",
- "selector.fit(Carseats, Y)\n",
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 72,
- "id": "3472d47c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('Price', 'Advertising', 'Income', 'ShelveLoc')"
- ]
- },
- "execution_count": 72,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "selector.selected_state_"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 73,
- "id": "5d2c82b9",
- "metadata": {},
- "outputs": [],
- "source": [
- "Hitters=load_data('Hitters')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 74,
- "id": "4b2ac2c2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n",
- " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n",
- " 'PutOuts', 'Assists', 'Errors', 'Salary', 'NewLeague'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 74,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Hitters.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "id": "bd2ad0dd",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dict_keys([(), ('AtBat',), ('Hits',), ('HmRun',), ('Runs',), ('RBI',), ('Walks',), ('Years',), ('CAtBat',), ('CHits',), ('CHmRun',), ('CRuns',), ('CRBI',), ('CWalks',), ('League',), ('Division',), ('PutOuts',), ('Assists',), ('Errors',), ('NewLeague',), ('AtBat', 'CRBI'), ('CRBI', 'Hits'), ('CRBI', 'HmRun'), ('CRBI', 'Runs'), ('CRBI', 'RBI'), ('CRBI', 'Walks'), ('CRBI', 'Years'), ('CAtBat', 'CRBI'), ('CHits', 'CRBI'), ('CHmRun', 'CRBI'), ('CRBI', 'CRuns'), ('CRBI', 'CWalks'), ('CRBI', 'League'), ('CRBI', 'Division'), ('CRBI', 'PutOuts'), ('Assists', 'CRBI'), ('CRBI', 'Errors'), ('CRBI', 'NewLeague'), ('AtBat', 'CRBI', 'Hits'), ('CRBI', 'Hits', 'HmRun'), ('CRBI', 'Hits', 'Runs'), ('CRBI', 'Hits', 'RBI'), ('CRBI', 'Hits', 'Walks'), ('CRBI', 'Hits', 'Years'), ('CAtBat', 'CRBI', 'Hits'), ('CHits', 'CRBI', 'Hits'), ('CHmRun', 'CRBI', 'Hits'), ('CRBI', 'CRuns', 'Hits'), ('CRBI', 'CWalks', 'Hits'), ('CRBI', 'Hits', 'League'), ('CRBI', 'Division', 'Hits'), ('CRBI', 'Hits', 'PutOuts'), ('Assists', 'CRBI', 'Hits'), ('CRBI', 'Errors', 'Hits'), ('CRBI', 'Hits', 'NewLeague'), ('AtBat', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'HmRun', 'PutOuts'), ('CRBI', 'Hits', 'PutOuts', 'Runs'), ('CRBI', 'Hits', 'PutOuts', 'RBI'), ('CRBI', 'Hits', 'PutOuts', 'Walks'), ('CRBI', 'Hits', 'PutOuts', 'Years'), ('CAtBat', 'CRBI', 'Hits', 'PutOuts'), ('CHits', 'CRBI', 'Hits', 'PutOuts'), ('CHmRun', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'CRuns', 'Hits', 'PutOuts'), ('CRBI', 'CWalks', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'League', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'PutOuts'), ('Assists', 'CRBI', 'Hits', 'PutOuts'), ('CRBI', 'Errors', 'Hits', 'PutOuts'), ('CRBI', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Runs'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'RBI'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('CRBI', 'Division', 'Hits', 'PutOuts', 'Years'), 
('CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CHits', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'League', 'PutOuts'), ('Assists', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Errors', 'Hits', 'PutOuts'), ('CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Runs'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'RBI'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CHits', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'League', 'PutOuts'), ('Assists', 'AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Errors', 'Hits', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts'), ('AtBat', 'CRBI', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 
'CRBI', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 
'Walks'), ('AtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'RBI', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 
'Division', 'Hits', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'RBI', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 
'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 
'HmRun', 'League', 'NewLeague', 'PutOuts', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks'), ('Assists', 'AtBat', 'CAtBat', 'CHits', 'CHmRun', 'CRBI', 'CRuns', 'CWalks', 'Division', 'Errors', 'Hits', 'HmRun', 'League', 'NewLeague', 'PutOuts', 'RBI', 'Runs', 'Walks', 'Years')])"
- ]
- },
- "execution_count": 75,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Hitters = Hitters.dropna()\n",
- "Y=Hitters['Salary']\n",
- "X=Hitters.drop('Salary', axis=1)\n",
- "design = ModelSpec(X.columns).fit(X)\n",
- "strategy = Stepwise.first_peak(design,\n",
- " direction='forward',\n",
- " min_terms=0,\n",
- " max_terms=19)\n",
- "selector = FeatureSelector(LinearRegression(fit_intercept=False),\n",
- " strategy,\n",
- " scoring='neg_mean_squared_error', cv=None)\n",
- "selector.fit(X, Y)\n",
- "selector.results_.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "id": "31788748",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19"
- ]
- },
- "execution_count": 76,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(selector.selected_state_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
- "id": "e97d80c3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "19"
- ]
- },
- "execution_count": 77,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(X.columns)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a71f0332",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Start: AIC=3215.77\n",
- "Salary ~ 1\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ CRBI 1 17139434 36179679 3115.8\n",
- "+ CRuns 1 16881162 36437951 3117.6\n",
- "+ CHits 1 16065140 37253973 3123.5\n",
- "+ CAtBat 1 14759710 38559403 3132.5\n",
- "+ CHmRun 1 14692193 38626920 3133.0\n",
- "+ CWalks 1 12792622 40526491 3145.6\n",
- "+ RBI 1 10771083 42548030 3158.4\n",
- "+ Walks 1 10504833 42814280 3160.1\n",
- "+ Hits 1 10260491 43058621 3161.6\n",
- "+ Runs 1 9399158 43919955 3166.8\n",
- "+ Years 1 8559105 44760007 3171.7\n",
- "+ AtBat 1 8309469 45009644 3173.2\n",
- "+ HmRun 1 6273967 47045145 3184.8\n",
- "+ PutOuts 1 4814100 48505013 3192.9\n",
- "+ Division 1 1976102 51343011 3207.8\n",
- " 53319113 3215.8\n",
- "+ Assists 1 34497 53284615 3217.6\n",
- "+ League 1 10876 53308237 3217.7\n",
- "+ Errors 1 1555 53317558 3217.8\n",
- "+ NewLeague 1 428 53318684 3217.8\n",
- "\n",
- "Step: AIC=3115.78\n",
- "Salary ~ CRBI\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ Hits 1 5533119 30646560 3074.1\n",
- "+ Runs 1 5176532 31003147 3077.2\n",
- "+ Walks 1 4199733 31979946 3085.3\n",
- "+ AtBat 1 4064585 32115095 3086.4\n",
- "+ RBI 1 3308272 32871407 3092.6\n",
- "+ PutOuts 1 3267035 32912644 3092.9\n",
- "+ Division 1 1733887 34445793 3104.9\n",
- "+ Years 1 1667339 34512340 3105.4\n",
- "+ HmRun 1 1271587 34908092 3108.4\n",
- "+ CRuns 1 354561 35825119 3115.2\n",
- "+ Assists 1 346020 35833659 3115.2\n",
- " 36179679 3115.8\n",
- "+ Errors 1 194403 35985276 3116.4\n",
- "+ CAtBat 1 92261 36087418 3117.1\n",
- "+ CHits 1 75469 36104210 3117.2\n",
- "+ CWalks 1 51974 36127705 3117.4\n",
- "+ NewLeague 1 17778 36161901 3117.7\n",
- "+ League 1 11825 36167855 3117.7\n",
- "+ CHmRun 1 515 36179165 3117.8\n",
- "\n",
- "Step: AIC=3074.13\n",
- "Salary ~ CRBI + Hits\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ PutOuts 1 1397263 29249297 3063.8\n",
- "+ Division 1 1279275 29367285 3064.9\n",
- "+ AtBat 1 821767 29824793 3069.0\n",
- "+ Walks 1 781767 29864793 3069.3\n",
- "+ Years 1 254910 30391650 3073.9\n",
- " 30646560 3074.1\n",
- "+ League 1 208880 30437680 3074.3\n",
- "+ CRuns 1 132614 30513946 3075.0\n",
- "+ NewLeague 1 118474 30528086 3075.1\n",
- "+ Runs 1 114198 30532362 3075.1\n",
- "+ Errors 1 99776 30546784 3075.3\n",
- "+ CAtBat 1 83517 30563043 3075.4\n",
- "+ Assists 1 44781 30601779 3075.7\n",
- "+ CWalks 1 23668 30622892 3075.9\n",
- "+ CHmRun 1 4790 30641769 3076.1\n",
- "+ CHits 1 4358 30642202 3076.1\n",
- "+ HmRun 1 2173 30644387 3076.1\n",
- "+ RBI 1 1137 30645423 3076.1\n",
- "\n",
- "Step: AIC=3063.85\n",
- "Salary ~ CRBI + Hits + PutOuts\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ Division 1 1278445 27970852 3054.1\n",
- "+ AtBat 1 1009933 28239364 3056.6\n",
- "+ Walks 1 539490 28709807 3061.0\n",
- "+ CRuns 1 273649 28975648 3063.4\n",
- " 29249297 3063.8\n",
- "+ Years 1 136906 29112391 3064.6\n",
- "+ League 1 122841 29126456 3064.8\n",
- "+ Runs 1 117930 29131367 3064.8\n",
- "+ Errors 1 97244 29152053 3065.0\n",
- "+ NewLeague 1 57839 29191458 3065.3\n",
- "+ CHits 1 35096 29214201 3065.5\n",
- "+ RBI 1 33965 29215331 3065.6\n",
- "+ HmRun 1 31227 29218070 3065.6\n",
- "+ CWalks 1 28572 29220725 3065.6\n",
- "+ CAtBat 1 20518 29228779 3065.7\n",
- "+ Assists 1 1681 29247616 3065.8\n",
- "+ CHmRun 1 1419 29247878 3065.8\n",
- "\n",
- "Step: AIC=3054.1\n",
- "Salary ~ CRBI + Hits + PutOuts + Division\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ AtBat 1 820952 27149899 3048.3\n",
- "+ Walks 1 491584 27479268 3051.4\n",
- " 27970852 3054.1\n",
- "+ CRuns 1 193604 27777248 3054.3\n",
- "+ Years 1 166845 27804007 3054.5\n",
- "+ League 1 110628 27860224 3055.1\n",
- "+ Errors 1 81385 27889467 3055.3\n",
- "+ Runs 1 65921 27904931 3055.5\n",
- "+ RBI 1 53719 27917133 3055.6\n",
- "+ NewLeague 1 52275 27918577 3055.6\n",
- "+ CHits 1 33863 27936989 3055.8\n",
- "+ HmRun 1 26390 27944462 3055.8\n",
- "+ CAtBat 1 18751 27952101 3055.9\n",
- "+ CWalks 1 5723 27965129 3056.0\n",
- "+ Assists 1 1036 27969816 3056.1\n",
- "+ CHmRun 1 165 27970687 3056.1\n",
- "\n",
- "Step: AIC=3048.26\n",
- "Salary ~ CRBI + Hits + PutOuts + Division + AtBat\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ Walks 1 954996 26194904 3040.8\n",
- "+ Years 1 253362 26896537 3047.8\n",
- "+ Runs 1 208743 26941157 3048.2\n",
- " 27149899 3048.3\n",
- "+ CRuns 1 185825 26964075 3048.5\n",
- "+ League 1 95986 27053913 3049.3\n",
- "+ NewLeague 1 52693 27097206 3049.8\n",
- "+ CHmRun 1 43173 27106726 3049.8\n",
- "+ Assists 1 28898 27121001 3050.0\n",
- "+ CAtBat 1 20989 27128910 3050.1\n",
- "+ CWalks 1 15599 27134301 3050.1\n",
- "+ Errors 1 6265 27143634 3050.2\n",
- "+ CHits 1 5305 27144594 3050.2\n",
- "+ RBI 1 1236 27148663 3050.2\n",
- "+ HmRun 1 11 27149888 3050.3\n",
- "\n",
- "Step: AIC=3040.85\n",
- "Salary ~ CRBI + Hits + PutOuts + Division + AtBat + Walks\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ CWalks 1 240687 25954217 3040.4\n",
- " 26194904 3040.8\n",
- "+ Years 1 184508 26010396 3041.0\n",
- "+ CRuns 1 110695 26084209 3041.7\n",
- "+ League 1 77974 26116930 3042.1\n",
- "+ Assists 1 75782 26119122 3042.1\n",
- "+ NewLeague 1 40909 26153995 3042.4\n",
- "+ CHits 1 37304 26157599 3042.5\n",
- "+ RBI 1 11728 26183176 3042.7\n",
- "+ HmRun 1 4747 26190157 3042.8\n",
- "+ Errors 1 2727 26192177 3042.8\n",
- "+ CAtBat 1 2630 26192274 3042.8\n",
- "+ CHmRun 1 943 26193961 3042.8\n",
- "+ Runs 1 37 26194867 3042.8\n",
- "\n",
- "Step: AIC=3040.42\n",
- "Salary ~ CRBI + Hits + PutOuts + Division + AtBat + Walks + CWalks\n",
- "\n",
- " Df Sum of Sq RSS AIC\n",
- "+ CRuns 1 794983 25159234 3034.2\n",
- "+ CHits 1 273728 25680489 3039.6\n",
- " 25954217 3040.4\n",
- "+ Assists 1 138506 25815711 3041.0\n",
- "+ CAtBat 1 89289 25864929 3041.5\n",
- "+ RBI 1 86941 25867276 3041.5\n",
- "+ League 1 77159 25877058 3041.6\n",
- "+ Years 1 70126 25884091 3041.7\n",
- "+ NewLeague 1 37807 25916410 3042.0\n",
- "+ HmRun 1 33601 25920616 3042.1\n",
- "+ CHmRun 1 9034 25945183 3042.3\n",
- "+ Errors 1 6928"
- ]
- }
- ],
- "source": [
- "%%R -i Hitters\n",
- "step(lm(Salary ~ 1, data=Hitters), scope=list(upper=lm(Salary ~ ., data=Hitters)), direction='forward', trace=TRUE)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6117f650",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "536a8bc3",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bddc13c5",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/docs/source/models/spec.ipynb b/docs/source/models/spec.ipynb
index d6ba7b0..fce6b32 100644
--- a/docs/source/models/spec.ipynb
+++ b/docs/source/models/spec.ipynb
@@ -31,7 +31,7 @@
"from ISLP.models import (ModelSpec,\n",
" summarize,\n",
" Column,\n",
- " Variable,\n",
+ " Feature,\n",
" build_columns)\n",
"\n",
"import statsmodels.api as sm"
@@ -257,7 +257,7 @@
"metadata": {},
"source": [
"We note that a column has been added for the intercept by default. This can be changed using the\n",
- "`intercept` argument. "
+ "`intercept` argument."
]
},
{
@@ -391,7 +391,7 @@
"in the column space of the design matrix.\n",
"\n",
"To include this intercept via `ShelveLoc` we can use 3 columns to encode this categorical variable. Following the nomenclature of\n",
- "`R`, we call this a `Contrast` of the categorical variable. "
+ "`R`, we call this a `Contrast` of the categorical variable."
]
},
{
@@ -597,14 +597,6 @@
"shelve.get_columns(Carseats)"
]
},
- {
- "cell_type": "markdown",
- "id": "5d8b048f-3c31-47ac-8946-0662f5e57b63",
- "metadata": {},
- "source": [
- "shelve.get_columns?"
- ]
- },
{
"cell_type": "markdown",
"id": "269e6d18-4ae4-4a77-8498-90281ae7c803",
@@ -946,7 +938,7 @@
"\n",
"The first argument to `ModelSpec` is stored as the `terms` attribute. Under the hood,\n",
"this sequence is inspected to produce the `terms_` attribute which specify the objects\n",
- "that will ultimately create the design matrix. "
+ "that will ultimately create the design matrix."
]
},
{
@@ -958,8 +950,8 @@
{
"data": {
"text/plain": [
- "[Variable(variables=('ShelveLoc',), name='ShelveLoc', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
+ "[Feature(variables=('ShelveLoc',), name='ShelveLoc', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
+ " Feature(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
]
},
"execution_count": 13,
@@ -978,7 +970,7 @@
"id": "warming-mobile",
"metadata": {},
"source": [
- "Each element of `terms_` should be a `Variable` which describes a set of columns to be extracted from\n",
+ "Each element of `terms_` should be a `Feature` which describes a set of columns to be extracted from\n",
"a columnar data form as well as possible a possible encoder."
]
},
@@ -1134,17 +1126,17 @@
"id": "former-spring",
"metadata": {},
"source": [
- "### `Variable` objects\n",
+ "### `Feature` objects\n",
"\n",
- "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
+ "Note that `Feature` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
"tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
- "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n",
+ "is run through `encoder.transform`. The `encoder.fit` method of each `Feature` is run once during \n",
"the call to `ModelSpec.fit`."
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 18,
"id": "floral-liabilities",
"metadata": {},
"outputs": [
@@ -1263,15 +1255,13 @@
"[400 rows x 3 columns]"
]
},
- "execution_count": 16,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "from ISLP.models.model_spec import Variable\n",
- "\n",
- "new_var = Variable(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=None)\n",
+ "new_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=None)\n",
"build_columns(MS.column_info_,\n",
" Carseats, \n",
" new_var)[0]"
@@ -1288,18 +1278,10 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 20,
"id": "imported-measure",
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
{
"data": {
"text/html": [
@@ -1403,7 +1385,7 @@
"[400 rows x 2 columns]"
]
},
- "execution_count": 17,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -1412,7 +1394,7 @@
"from sklearn.decomposition import PCA\n",
"pca = PCA(n_components=2)\n",
"pca.fit(build_columns(MS.column_info_, Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca_var = Variable(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=pca)\n",
+ "pca_var = Feature(('Price', 'Income', 'OIncome'), name='mynewvar', encoder=pca)\n",
"build_columns(MS.column_info_,\n",
" Carseats, \n",
" pca_var)[0]"
@@ -1424,23 +1406,15 @@
"metadata": {},
"source": [
"The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
- "or `Variable` instances (`pca_var`)."
+ "or `Feature` instances (`pca_var`)."
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 21,
"id": "western-bloom",
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
{
"data": {
"text/html": [
@@ -1568,14 +1542,14 @@
"[400 rows x 4 columns]"
]
},
- "execution_count": 18,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"price = MS.column_info_['Price']\n",
- "fancy_var = Variable(('Income', price, pca_var), name='fancy', encoder=None)\n",
+ "fancy_var = Feature(('Income', price, pca_var), name='fancy', encoder=None)\n",
"build_columns(MS.column_info_,\n",
" Carseats, \n",
" fancy_var)[0]"
@@ -1583,121 +1557,95 @@
},
{
"cell_type": "markdown",
- "id": "absent-branch",
+ "id": "e289feba-e3f5-48e0-9e29-cdd88d7f9923",
"metadata": {},
"source": [
- "## Predicting at new points\n",
- "\n",
- "As `ModelSpec` is a transformer, it can be evaluated at new feature values.\n",
- "Constructing the design matrix at any values is carried out by the `transform` method."
+ "## Predicting at new points"
]
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "naked-hollywood",
+ "execution_count": 22,
+ "id": "6efed2fa-9e5d-429c-a8d9-ac544cab2b41",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "array([ 9.73389663, 26.06456997])"
+ "intercept 12.661546\n",
+ "Price -0.052213\n",
+ "Income 0.012829\n",
+ "dtype: float64"
]
},
- "execution_count": 19,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "new_data = pd.DataFrame({'Income':['Bad', 'Good'], 'Price':[40, 50]})\n",
- "new_X = MS.transform(new_data)\n",
- "M_ols.get_prediction(new_X).predicted_mean"
+ "MS = ModelSpec(['Price', 'Income']).fit(Carseats)\n",
+ "X = MS.transform(Carseats)\n",
+ "Y = Carseats['Sales']\n",
+ "M_ols = sm.OLS(Y, X).fit()\n",
+ "M_ols.params"
]
},
{
"cell_type": "markdown",
- "id": "signal-yahoo",
+ "id": "e6b4609b-fcb2-4cc2-b630-509df4c87546",
"metadata": {},
"source": [
- "## Using `np.ndarray`\n",
- "\n",
- "As the basic model is to concatenate columns extracted from a columnar data\n",
- "representation, one *can* use `np.ndarray` as the column data. In this case,\n",
- "columns will be selected by integer indices. \n",
- "\n",
- "### Caveats using `np.ndarray`\n",
- "\n",
- "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
- "However,\n",
- "unless all features are floats, `np.ndarray` will default to a dtype of `object`, complicating issues.\n",
- "\n",
- "However, if we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
- "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. \n",
- "\n",
- "We illustrate this below, where we build a model from `Price` and `Income` for `Sales` and want to find predictions at new\n",
- "values of `Price` and `Location`. We first find the predicitions using `pd.DataFrame` and then illustrate the difficulties\n",
- "in using `np.ndarray`."
+ "As `ModelSpec` is a transformer, it can be evaluated at new feature values.\n",
+ "Constructing the design matrix at any values is carried out by the `transform` method."
]
},
{
"cell_type": "code",
- "execution_count": 42,
- "id": "964ecc79-7303-410c-b258-2d58341c7dc0",
+ "execution_count": 23,
+ "id": "8784b0e8-ce53-4a90-aee6-b935834295c7",
"metadata": {},
"outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " intercept Price Income\n",
- "0 1.0 40 10\n",
- "1 1.0 50 20\n"
- ]
- },
{
"data": {
"text/plain": [
- "intercept 12.661546\n",
- "Price -0.052213\n",
- "Income 0.012829\n",
- "dtype: float64"
+ "array([10.70130676, 10.307465 ])"
]
},
- "execution_count": 42,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "MS = ModelSpec(['Price', 'Income']).fit(Carseats)\n",
- "M_ols = sm.OLS(Y, MS.transform(Carseats)).fit()\n",
- "\n",
"new_data = pd.DataFrame({'Price':[40, 50], 'Income':[10, 20]})\n",
"new_X = MS.transform(new_data)\n",
- "print(new_X)\n",
- "M_ols.params"
+ "M_ols.get_prediction(new_X).predicted_mean"
]
},
{
- "cell_type": "code",
- "execution_count": 25,
- "id": "a42c239e-a5eb-4c5d-919e-16c4d58d1c8d",
+ "cell_type": "markdown",
+ "id": "signal-yahoo",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([10.70130676, 10.307465 ])"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "M_ols.get_prediction(new_X).predicted_mean"
+ "## Using `np.ndarray`\n",
+ "\n",
+ "As the basic model is to concatenate columns extracted from a columnar data\n",
+ "representation, one *can* use `np.ndarray` as the column data. In this case,\n",
+ "columns will be selected by integer indices. \n",
+ "\n",
+ "### Caveats using `np.ndarray`\n",
+ "\n",
+ "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
+ "However,\n",
+ "unless all features are floats, `np.ndarray` will default to a dtype of `object`, complicating issues.\n",
+ "\n",
+ "However, if we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
+ "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning. \n",
+ "\n",
+ "We illustrate this below, where we build a model from `Price` and `Income` for `Sales` and want to find predictions at new\n",
+ "values of `Price` and `Income`. We first find the predictions using `pd.DataFrame` and then illustrate the difficulties\n",
+ "in using `np.ndarray`."
]
},
{
@@ -1710,7 +1658,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 24,
"id": "4fec9030-7445-48be-a15f-2ac5a789e717",
"metadata": {},
"outputs": [
@@ -1726,7 +1674,7 @@
" [ 1., 120., 37.]])"
]
},
- "execution_count": 34,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -1739,7 +1687,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 25,
"id": "c864e365-2476-4ca6-9d27-625cac2b2271",
"metadata": {},
"outputs": [
@@ -1752,7 +1700,7 @@
"dtype: float64"
]
},
- "execution_count": 36,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@@ -1779,7 +1727,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 26,
"id": "incredible-concert",
"metadata": {},
"outputs": [
@@ -1813,7 +1761,7 @@
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 27,
"id": "stunning-container",
"metadata": {},
"outputs": [
@@ -1831,7 +1779,7 @@
"array([10.70130676, 10.307465 ])"
]
},
- "execution_count": 45,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@@ -1840,7 +1788,7 @@
"new_D = np.array([[40,50], [np.nan, np.nan], [10,20]]).T\n",
"new_X = MS_np.transform(new_D)\n",
"print(new_X)\n",
- "M_ols.get_prediction(new_X).predicted_mean\n"
+ "M_ols.get_prediction(new_X).predicted_mean"
]
},
{
@@ -1855,10 +1803,10 @@
],
"metadata": {
"jupytext": {
- "formats": "ipynb"
+ "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
},
"kernelspec": {
- "display_name": "python3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -1872,7 +1820,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.10.10"
}
},
"nbformat": 4,
diff --git a/docs/source/models/submodels.ipynb b/docs/source/models/submodels.ipynb
deleted file mode 100644
index 825bedd..0000000
--- a/docs/source/models/submodels.ipynb
+++ /dev/null
@@ -1,3127 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "ee33d364",
- "metadata": {},
- "source": [
- "# Building design matrices with `ModelSpec`\n",
- "\n",
- "Force rebuild"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "4c70fbaa",
- "metadata": {},
- "outputs": [],
- "source": [
- "x=4\n",
- "import numpy as np, pandas as pd\n",
- "%load_ext rpy2.ipython\n",
- "\n",
- "from ISLP import load_data\n",
- "from ISLP.models import ModelSpec\n",
- "\n",
- "import statsmodels.api as sm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "8a708215",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',\n",
- " 'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats = load_data('Carseats')\n",
- "%R -i Carseats\n",
- "Carseats.columns"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dad5e991",
- "metadata": {},
- "source": [
- "## Let's break up income into groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "ac7086a5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: OIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L' < 'M' < 'H']"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['OIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'])\n",
- "Carseats['OIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "261446c8",
- "metadata": {},
- "source": [
- "Let's also create an unordered version"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "674bb806",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 M\n",
- "1 L\n",
- "2 L\n",
- "3 H\n",
- "4 M\n",
- " ..\n",
- "395 H\n",
- "396 L\n",
- "397 L\n",
- "398 M\n",
- "399 L\n",
- "Name: UIncome, Length: 400, dtype: category\n",
- "Categories (3, object): ['L', 'M', 'H']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['UIncome'] = pd.cut(Carseats['Income'], \n",
- " [0,50,90,200], \n",
- " labels=['L','M','H'],\n",
- " ordered=False)\n",
- "Carseats['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8f030039",
- "metadata": {},
- "source": [
- "## A simple model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "40cd6c28",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Price', 'Income'], dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Price', 'Income'])\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "e65f5607",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 12.661546\n",
- "Price -0.052213\n",
- "Income 0.012829\n",
- "dtype: float64"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Y = Carseats['Sales']\n",
- "M = sm.OLS(Y, X).fit()\n",
- "M.params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "29d9b55f",
- "metadata": {},
- "source": [
- "## Basic procedure\n",
- "\n",
- "The design matrix is built by cobbling together a set of columns and possibly transforming them.\n",
- "A `pd.DataFrame` is essentially a list of columns. One of the first tasks done in `ModelSpec.fit`\n",
- "is to inspect a dataframe for column info. The column `ShelveLoc` is categorical:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "cfbe5b92",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 Bad\n",
- "1 Good\n",
- "2 Medium\n",
- "3 Medium\n",
- "4 Bad\n",
- " ... \n",
- "395 Good\n",
- "396 Medium\n",
- "397 Medium\n",
- "398 Bad\n",
- "399 Good\n",
- "Name: ShelveLoc, Length: 400, dtype: category\n",
- "Categories (3, object): ['Bad', 'Good', 'Medium']"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats['ShelveLoc']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7092f666",
- "metadata": {},
- "source": [
- "This is recognized by `ModelSpec` in the form of `Column` objects which are just named tuples with two methods\n",
- "`get_columns` and `fit_encoder`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "e2d43844",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='ShelveLoc', name='ShelveLoc', is_categorical=True, is_ordinal=False, columns=('ShelveLoc[Good]', 'ShelveLoc[Medium]'), encoder=Contrast())"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['ShelveLoc']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "46a01612",
- "metadata": {},
- "source": [
- "It recognized ordinal columns as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "465a9326",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='OIncome', name='OIncome', is_categorical=True, is_ordinal=True, columns=('OIncome',), encoder=OrdinalEncoder())"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['OIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "76f8480d",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([ 73, 48, 35, 100]), ('Income',))"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "income = design.column_info_['Income']\n",
- "cols, names = income.get_columns(Carseats)\n",
- "(cols[:4], names)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "25fcc1de",
- "metadata": {},
- "source": [
- "## Encoding a column\n",
- "\n",
- "In building a design matrix we must extract columns from our dataframe (or `np.ndarray`). Categorical\n",
- "variables usually are encoded by several columns, typically one less than the number of categories.\n",
- "This task is handled by the `encoder` of the `Column`. The encoder must satisfy the `sklearn` transform\n",
- "model, i.e. `fit` on some array and `transform` on future arrays. The `fit_encoder` method of `Column` fits\n",
- "its encoder the first time data is passed to it."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "dfe6cc35",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(array([[0., 0.],\n",
- " [1., 0.],\n",
- " [0., 1.],\n",
- " [0., 1.]]),\n",
- " ['ShelveLoc[Good]', 'ShelveLoc[Medium]'])"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "shelve = design.column_info_['ShelveLoc']\n",
- "cols, names = shelve.get_columns(Carseats)\n",
- "(cols[:4], names)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "8fc9779a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[2.],\n",
- " [1.],\n",
- " [1.],\n",
- " [0.]])"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "oincome = design.column_info_['OIncome']\n",
- "oincome.get_columns(Carseats)[0][:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8e04da60",
- "metadata": {},
- "source": [
- "## The terms\n",
- "\n",
- "The design matrix consists of several sets of columns. This is managed by the `ModelSpec` through\n",
- "the `terms` argument which should be a sequence. The elements of `terms` are often\n",
- "going to be strings (or tuples of strings for interactions, see below) but are converted to a\n",
- "`Variable` object and stored in the `terms_` of the fitted `ModelSpec`. A `Variable` is just a named tuple."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "c579dbce",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Price', 'Income']"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "4587b8bd",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=('Price',), name='Price', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=('Income',), name='Income', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2595f0fa",
- "metadata": {},
- "source": [
- "While each `Column` can itself extract data, they are all promoted to `Variable` to be of a uniform type. A\n",
- "`Variable` can also create columns through the `build_columns` method of `ModelSpec`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "03bd9366",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price\n",
- " 0 120\n",
- " 1 83\n",
- " 2 80\n",
- " 3 97\n",
- " 4 128\n",
- " .. ...\n",
- " 395 128\n",
- " 396 120\n",
- " 397 159\n",
- " 398 95\n",
- " 399 120\n",
- " \n",
- " [400 rows x 1 columns],\n",
- " ['Price'])"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "price = design.terms_[0]\n",
- "design.build_columns(Carseats, price)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "de04ca48",
- "metadata": {},
- "source": [
- "Note that `Variable` objects have a tuple of `variables` as well as an `encoder` attribute. The\n",
- "tuple of `variables` first creates a concatenated dataframe from all corresponding variables and then\n",
- "is run through `encoder.transform`. The `encoder.fit` method of each `Variable` is run once during \n",
- "the call to `ModelSpec.fit`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "a42af4c5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( Price Income UIncome[L] UIncome[M]\n",
- " 0 120.0 73.0 0.0 1.0\n",
- " 1 83.0 48.0 1.0 0.0\n",
- " 2 80.0 35.0 1.0 0.0\n",
- " 3 97.0 100.0 0.0 0.0\n",
- " 4 128.0 64.0 0.0 1.0\n",
- " .. ... ... ... ...\n",
- " 395 128.0 108.0 0.0 0.0\n",
- " 396 120.0 23.0 1.0 0.0\n",
- " 397 159.0 26.0 1.0 0.0\n",
- " 398 95.0 79.0 0.0 1.0\n",
- " 399 120.0 37.0 1.0 0.0\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Income', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import Variable\n",
- "\n",
- "new_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=None)\n",
- "design.build_columns(Carseats, new_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b146d0c0",
- "metadata": {},
- "source": [
- "Let's now transform these columns with an encoder. Within `ModelSpec` we will first build the\n",
- "arrays above and then call `pca.fit` and finally `pca.transform` within `design.build_columns`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "b6c394a6",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( mynewvar[0] mynewvar[1]\n",
- " 0 -3.608693 -4.853177\n",
- " 1 15.081506 35.708630\n",
- " 2 27.422871 40.774250\n",
- " 3 -33.973209 13.470489\n",
- " 4 6.567316 -11.290100\n",
- " .. ... ...\n",
- " 395 -36.846346 -18.415783\n",
- " 396 45.741500 3.245602\n",
- " 397 49.097533 -35.725355\n",
- " 398 -13.577772 18.845139\n",
- " 399 31.927566 0.978436\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.decomposition import PCA\n",
- "pca = PCA(n_components=2)\n",
- "pca.fit(design.build_columns(Carseats, new_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca_var = Variable(('Price', 'Income', 'UIncome'), name='mynewvar', encoder=pca)\n",
- "design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3bb30a3f",
- "metadata": {},
- "source": [
- "The elements of the `variables` attribute may be column identifiers ( `\"Price\"`), `Column` instances (`price`)\n",
- "or `Variable` instances (`pca_var`)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "ea7770ff",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( Price Price mynewvar[0] mynewvar[1]\n",
- " 0 120.0 120.0 -3.608693 -4.853177\n",
- " 1 83.0 83.0 15.081506 35.708630\n",
- " 2 80.0 80.0 27.422871 40.774250\n",
- " 3 97.0 97.0 -33.973209 13.470489\n",
- " 4 128.0 128.0 6.567316 -11.290100\n",
- " .. ... ... ... ...\n",
- " 395 128.0 128.0 -36.846346 -18.415783\n",
- " 396 120.0 120.0 45.741500 3.245602\n",
- " 397 159.0 159.0 49.097533 -35.725355\n",
- " 398 95.0 95.0 -13.577772 18.845139\n",
- " 399 120.0 120.0 31.927566 0.978436\n",
- " \n",
- " [400 rows x 4 columns],\n",
- " ['Price', 'Price', 'mynewvar[0]', 'mynewvar[1]'])"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "fancy_var = Variable(('Price', price, pca_var), name='fancy', encoder=None)\n",
- "design.build_columns(Carseats, fancy_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b2b4a01a",
- "metadata": {},
- "source": [
- "We can of course run PCA again on these features (if we wanted)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "21ad8b44",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "( fancy_pca[0] fancy_pca[1]\n",
- " 0 -6.951792 4.859283\n",
- " 1 55.170148 -24.694875\n",
- " 2 59.418556 -38.033572\n",
- " 3 34.722389 28.922184\n",
- " 4 -21.419184 -3.120673\n",
- " .. ... ...\n",
- " 395 -18.257348 40.760122\n",
- " 396 -10.546709 -45.021658\n",
- " 397 -77.706359 -37.174379\n",
- " 398 36.668694 7.730851\n",
- " 399 -9.540535 -31.059122\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['fancy_pca[0]', 'fancy_pca[1]'])"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pca2 = PCA(n_components=2)\n",
- "pca2.fit(design.build_columns(Carseats, fancy_var)[0]) # this is done within `ModelSpec.fit`\n",
- "pca2_var = Variable(('Price', price, pca_var), name='fancy_pca', encoder=pca2)\n",
- "design.build_columns(Carseats, pca2_var)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2262377d",
- "metadata": {},
- "source": [
- "## Building the design matrix\n",
- "\n",
- "With these notions in mind, the final design is essentially then"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "1654ca47",
- "metadata": {},
- "outputs": [],
- "source": [
- "X_hand = np.column_stack([design.build_columns(Carseats, v)[0] for v in design.terms_])[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1db0e0a9",
- "metadata": {},
- "source": [
- "An intercept column is added if `design.intercept` is `True` and if the original argument to `transform` is\n",
- "a dataframe the index is adjusted accordingly."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "d20e8ea8",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.intercept"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "450fe910",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " Price | \n",
- " Income | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 120 | \n",
- " 73 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 83 | \n",
- " 48 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 80 | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 97 | \n",
- " 100 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " intercept Price Income\n",
- "0 1.0 120 73\n",
- "1 1.0 83 48\n",
- "2 1.0 80 35\n",
- "3 1.0 97 100"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.transform(Carseats)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0705ba6f",
- "metadata": {},
- "source": [
- "## Predicting\n",
- "\n",
- "Constructing the design matrix at any values is carried out by the `transform` method."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "866c2863",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_data = pd.DataFrame({'Price':[10,20], 'Income':[40, 50]})\n",
- "new_X = design.transform(new_data)\n",
- "M.get_prediction(new_X).predicted_mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "f2021166",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 0 1 \n",
- "12.65258 12.25873 \n"
- ]
- }
- ],
- "source": [
- "%%R -i new_data,Carseats\n",
- "predict(lm(Sales ~ Price + Income, data=Carseats), new_data)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20e1a31a",
- "metadata": {},
- "source": [
- "### Difference between using `pd.DataFrame` and `np.ndarray`\n",
- "\n",
- "If the `terms` only refer to a few columns of the data frame, the `transform` method only needs a dataframe with those columns.\n",
- "\n",
- "If we had used an `np.ndarray`, the column identifiers would be integers identifying specific columns so,\n",
- "in order to work correctly, `transform` would need another `np.ndarray` where the columns have the same meaning."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "a5926ec9",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([[1.0, 120, 73],\n",
- " [1.0, 83, 48],\n",
- " [1.0, 80, 35],\n",
- " [1.0, 97, 100]], dtype=object)"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "Carseats_np = np.asarray(Carseats[['Price', 'ShelveLoc', 'US', 'Income']])\n",
- "design_np = ModelSpec([0,3]).fit(Carseats_np)\n",
- "design_np.transform(Carseats_np)[:4]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "997a63cb",
- "metadata": {},
- "source": [
- "The following will fail for hopefully obvious reasons"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "40410c48",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "shapes (2,2) and (3,) not aligned: 2 (dim 1) != 3 (dim 0)\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_D = np.zeros((2,2))\n",
- " new_D[:,0] = [10,20]\n",
- " new_D[:,1] = [40,50]\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except ValueError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "920203e9",
- "metadata": {},
- "source": [
- "Ultimately, `M` expects 3 columns for new predictions because it was fit\n",
- "with a matrix having 3 columns (the first representing an intercept).\n",
- "\n",
- "We might be tempted to try as with the `pd.DataFrame` and produce\n",
- "an `np.ndarray` with only the necessary variables."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "1061da77",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "index 3 is out of bounds for axis 1 with size 2\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " new_X = np.zeros((2,2))\n",
- " new_X[:,0] = [10,20]\n",
- " new_X[:,1] = [40,50]\n",
- " new_D = design_np.transform(new_X)\n",
- " M.get_prediction(new_D).predicted_mean\n",
- "except IndexError as e:\n",
- " print(e)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c6bfe001",
- "metadata": {},
- "source": [
- "This fails because `design_np` is looking for column `3` from its `terms`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "5ae6d25f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Variable(variables=(0,), name='0', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False),\n",
- " Variable(variables=(3,), name='3', encoder=None, use_transform=True, pure_columns=True, override_encoder_colnames=False)]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design_np.terms_"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "edd7ebeb",
- "metadata": {},
- "source": [
- "However, if we have an `np.ndarray` in which the first column indeed represents `Price` and the fourth indeed\n",
- "represents `Income` then we can arrive at the correct answer by supplying such the array to `design_np.transform`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "9455e532",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([12.65257604, 12.25873428])"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "new_X = np.zeros((2,4))\n",
- "new_X[:,0] = [10,20]\n",
- "new_X[:,3] = [40,50]\n",
- "new_D = design_np.transform(new_X)\n",
- "M.get_prediction(new_D).predicted_mean"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fd726791",
- "metadata": {},
- "source": [
- "Given this subtlety about needing to supply arrays with identical column structure to `transform` when\n",
- "using `np.ndarray` we presume that using a `pd.DataFrame` will be the more popular use case."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "967d9ebc",
- "metadata": {},
- "source": [
- "## A model with some categorical variables\n",
- "\n",
- "Categorical variables become `Column` instances with encoders."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "d0429b56",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', 'UIncome', 'ShelveLoc']).fit(Carseats)\n",
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "415e3fd0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[L]', 'UIncome[M]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "id": "8a99c3a5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 11.876012\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[L] -1.042297\n",
- "UIncome[M] -0.119123\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "9250a28a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fe90c12c",
- "metadata": {},
- "source": [
- "## Getting the encoding you want\n",
- "\n",
- "By default the level dropped by `ModelSpec` will be the first of the `categories_` values from \n",
- "`sklearn.preprocessing.OneHotEncoder()`. We might wish to change this. It seems\n",
- "as if the correct way to do this would be something like `Variable(('UIncome',), 'mynewencoding', new_encoder)`\n",
- "where `new_encoder` would somehow drop the column we want dropped. \n",
- "\n",
- "However, when using the convenient identifier `UIncome` in the `variables` argument, this maps to the `Column` associated to `UIncome` within `design.column_info_`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "0546ec84",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Column(idx='UIncome', name='UIncome', is_categorical=True, is_ordinal=False, columns=('UIncome[L]', 'UIncome[M]'), encoder=Contrast())"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.column_info_['UIncome']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6ec4fe65",
- "metadata": {},
- "source": [
- "This column already has an encoder and `Column` instances are immutable as named tuples. Further, there are times when \n",
- "we may want to encode `UIncome` differently within the same model. In the model below the main effect of `UIncome` is encoded with two columns while in the interaction `UIncome` (see below) has three columns. This is a design of interest\n",
- "and we need a way to allow different encodings of the same column of `Carseats`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "61e7f56e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "802ed854",
- "metadata": {},
- "source": [
- " We can create a new \n",
- "`Column` with the encoder we want. For categorical variables, there is a convenience function to do so."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "82d7a01d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from ISLP.models.model_spec import contrast\n",
- "pref_encoding = contrast('UIncome', 'drop', 'L')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "e26849a1",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[M] UIncome[H]\n",
- " 0 1.0 0.0\n",
- " 1 0.0 0.0\n",
- " 2 0.0 0.0\n",
- " 3 0.0 1.0\n",
- " 4 1.0 0.0\n",
- " .. ... ...\n",
- " 395 0.0 1.0\n",
- " 396 0.0 0.0\n",
- " 397 0.0 0.0\n",
- " 398 1.0 0.0\n",
- " 399 0.0 0.0\n",
- " \n",
- " [400 rows x 2 columns],\n",
- " ['UIncome[M]', 'UIncome[H]'])"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, pref_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "2fc4cd8c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['intercept', 'Population', 'Price', 'UIncome[M]', 'UIncome[H]',\n",
- " 'ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['Population', 'Price', pref_encoding, 'ShelveLoc']).fit(Carseats)\n",
- "X = design.fit_transform(Carseats)\n",
- "X.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "49e33d41",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 10.833715\n",
- "Population 0.001163\n",
- "Price -0.055725\n",
- "UIncome[M] 0.923174\n",
- "UIncome[H] 1.042297\n",
- "ShelveLoc[Good] 4.999623\n",
- "ShelveLoc[Medium] 1.964278\n",
- "dtype: float64"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "ce018fdf",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) Population Price UIncomeM UIncomeH \n",
- " 10.83371503 0.00116301 -0.05572469 0.92317388 1.04229679 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.99962319 1.96427771 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ Population + Price + UIncome + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2d42b822",
- "metadata": {},
- "source": [
- "## Interactions\n",
- "\n",
- "We've referred to interactions above. These are specified (by convenience) as tuples in the `terms` argument\n",
- "to `ModelSpec`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "fbb3e3ba",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 7.866634\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "UIncome[L] -2.734895\n",
- "UIncome[M] -2.619745\n",
- "dtype: float64"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([('UIncome', 'ShelveLoc'), 'UIncome'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f9a7d4ad",
- "metadata": {},
- "source": [
- "The tuples in `terms` are converted to `Variable` in the formalized `terms_` attribute by creating a `Variable` with\n",
- "`variables` set to the tuple and the encoder an `Interaction` encoder which (unsurprisingly) creates the interaction columns from the concatenated data frames of `UIncome` and `ShelveLoc`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "5a6f8e69",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('UIncome', 'ShelveLoc'), name='UIncome:ShelveLoc', encoder=Interaction(column_names={'ShelveLoc': ['ShelveLoc[Good]', 'ShelveLoc[Medium]'],\n",
- " 'UIncome': ['UIncome[L]', 'UIncome[M]']},\n",
- " columns={'ShelveLoc': range(2, 4), 'UIncome': range(0, 2)},\n",
- " variables=['UIncome', 'ShelveLoc']), use_transform=True, pure_columns=False, override_encoder_colnames=False)"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.terms_[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "98eef5c8",
- "metadata": {},
- "source": [
- "Comparing this to the previous `R` model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "58c99601",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ UIncome:ShelveLoc + UIncome, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) UIncomeM UIncomeH \n",
- " 5.1317 0.1151 1.1561 \n",
- " UIncomeL:ShelveLocGood UIncomeM:ShelveLocGood UIncomeH:ShelveLocGood \n",
- " 4.5121 5.5752 3.7381 \n",
- "UIncomeL:ShelveLocMedium UIncomeM:ShelveLocMedium UIncomeH:ShelveLocMedium \n",
- " 1.2473 2.4782 1.5141 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9c979d7e",
- "metadata": {},
- "source": [
- "We note a few important things:\n",
- "\n",
- "1. `R` has reorganized the columns of the design from the formula: although we wrote `UIncome:ShelveLoc` first these\n",
- "columns have been built later. **`ModelSpec` builds columns in the order determined by `terms`!**\n",
- "\n",
- "2. As noted above, `R` has encoded `UIncome` differently in the main effect and in the interaction. For `ModelSpec`, the reference to `UIncome` always refers to the column in `design.column_info_` and will always build only the columns for `L` and `M`. **`ModelSpec` does no inspection of terms to decide how to encode categorical variables.**\n",
- "\n",
- "A few notes:\n",
- "\n",
- "- **Why not try to inspect the terms?** For any nontrivial formula in `R` with several categorical variables and interactions, predicting what columns will be produced from a given formula is not simple. **`ModelSpec` errs on the side of being explicit.**\n",
- "\n",
- "- **Is it impossible to build the design as `R` has?** No. An advanced user who *knows* they want the columns built as `R` has can do so (fairly) easily."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "0cb3b63a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( UIncome[H] UIncome[L] UIncome[M]\n",
- " 0 0.0 0.0 1.0\n",
- " 1 0.0 1.0 0.0\n",
- " 2 0.0 1.0 0.0\n",
- " 3 1.0 0.0 0.0\n",
- " 4 0.0 0.0 1.0\n",
- " .. ... ... ...\n",
- " 395 1.0 0.0 0.0\n",
- " 396 0.0 1.0 0.0\n",
- " 397 0.0 1.0 0.0\n",
- " 398 0.0 0.0 1.0\n",
- " 399 0.0 1.0 0.0\n",
- " \n",
- " [400 rows x 3 columns],\n",
- " ['UIncome[H]', 'UIncome[L]', 'UIncome[M]'])"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "full_encoding = contrast('UIncome', None)\n",
- "design.build_columns(Carseats, full_encoding)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "id": "272098d7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.131739\n",
- "UIncome[M] 0.115150\n",
- "UIncome[H] 1.156118\n",
- "UIncome[H]:ShelveLoc[Good] 3.738052\n",
- "UIncome[H]:ShelveLoc[Medium] 1.514104\n",
- "UIncome[L]:ShelveLoc[Good] 4.512054\n",
- "UIncome[L]:ShelveLoc[Medium] 1.247275\n",
- "UIncome[M]:ShelveLoc[Good] 5.575170\n",
- "UIncome[M]:ShelveLoc[Medium] 2.478163\n",
- "dtype: float64"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fe05c471",
- "metadata": {},
- "source": [
- "## Special encodings\n",
- "\n",
- "For flexible models, we may want to consider transformations of features, i.e. polynomial\n",
- "or spline transformations. Given transforms that follow the `fit/transform` paradigm\n",
- "we can of course achieve this with a `Column` and an `encoder`. The `ISLP.transforms`\n",
- "package includes a `Poly` transform"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "id": "67062299",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Variable(variables=('Income',), name='poly(Income, 3, )', encoder=Poly(degree=3), use_transform=True, pure_columns=False, override_encoder_colnames=True)"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import poly\n",
- "poly('Income', 3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "df5e5b4d",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 5.440077\n",
- "poly(Income, 3, )[0] 10.036373\n",
- "poly(Income, 3, )[1] -2.799156\n",
- "poly(Income, 3, )[2] 2.399601\n",
- "ShelveLoc[Good] 4.808133\n",
- "ShelveLoc[Medium] 1.889533\n",
- "dtype: float64"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([poly('Income', 3), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "01be9c13",
- "metadata": {},
- "source": [
- "Compare:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "3244d6f6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) poly(Income, 3)1 poly(Income, 3)2 poly(Income, 3)3 \n",
- " 5.440077 10.036373 -2.799156 2.399601 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.808133 1.889533 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ poly(Income, 3) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8ad5bb1d",
- "metadata": {},
- "source": [
- "## Splines\n",
- "\n",
- "Support for natural and B-splines is also included"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "6a6f4358",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 4.240421\n",
- "ns(Income, , df=5)[0] 1.468196\n",
- "ns(Income, , df=5)[1] 1.499471\n",
- "ns(Income, , df=5)[2] 1.152070\n",
- "ns(Income, , df=5)[3] 2.418398\n",
- "ns(Income, , df=5)[4] 1.804460\n",
- "ShelveLoc[Good] 4.810449\n",
- "ShelveLoc[Medium] 1.881095\n",
- "dtype: float64"
- ]
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from ISLP.models.model_spec import ns, bs, pca\n",
- "design = ModelSpec([ns('Income', df=5), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "fb740953",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) ns(Income, df = 5)1 ns(Income, df = 5)2 ns(Income, df = 5)3 \n",
- " 4.240421 1.468196 1.499471 1.152070 \n",
- "ns(Income, df = 5)4 ns(Income, df = 5)5 ShelveLocGood ShelveLocMedium \n",
- " 2.418398 1.804460 4.810449 1.881095 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "library(splines)\n",
- "lm(Sales ~ ns(Income, df=5) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "id": "fe1bf7fe",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 3.495085\n",
- "bs(Income, , df=7, degree=2)[0] 1.813118\n",
- "bs(Income, , df=7, degree=2)[1] 0.961852\n",
- "bs(Income, , df=7, degree=2)[2] 2.471545\n",
- "bs(Income, , df=7, degree=2)[3] 2.158891\n",
- "bs(Income, , df=7, degree=2)[4] 2.091625\n",
- "bs(Income, , df=7, degree=2)[5] 2.600669\n",
- "bs(Income, , df=7, degree=2)[6] 2.843108\n",
- "ShelveLoc[Good] 4.804919\n",
- "ShelveLoc[Medium] 1.880337\n",
- "dtype: float64"
- ]
- },
- "execution_count": 51,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([bs('Income', df=7, degree=2), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "86e966e0",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " (Intercept) bs(Income, df = 7, degree = 2)1 \n",
- " 3.4950851 1.8131176 \n",
- "bs(Income, df = 7, degree = 2)2 bs(Income, df = 7, degree = 2)3 \n",
- " 0.9618523 2.4715450 \n",
- "bs(Income, df = 7, degree = 2)4 bs(Income, df = 7, degree = 2)5 \n",
- " 2.1588908 2.0916252 \n",
- "bs(Income, df = 7, degree = 2)6 bs(Income, df = 7, degree = 2)7 \n",
- " 2.6006694 2.8431084 \n",
- " ShelveLocGood ShelveLocMedium \n",
- " 4.8049190 1.8803375 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ bs(Income, df=7, degree=2) + ShelveLoc, data=Carseats)$coef"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "877d4784",
- "metadata": {},
- "source": [
- "## PCA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "8ba6cb20",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.419405\n",
- "pca(myvars, , n_components=2)[0] -0.001131\n",
- "pca(myvars, , n_components=2)[1] -0.024217\n",
- "ShelveLoc[Good] 4.816253\n",
- "ShelveLoc[Medium] 1.924139\n",
- "dtype: float64"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars'), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "f0319e51",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population))$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.419405 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC1 \n",
- " 0.001131 \n",
- "prcomp(cbind(Income, Price, Advertising, Population))$x[, 1:2]PC2 \n",
- " -0.024217 \n",
- " ShelveLocGood \n",
- " 4.816253 \n",
- " ShelveLocMedium \n",
- " 1.924139 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population))$x[,1:2] + ShelveLoc, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1f55086a",
- "metadata": {},
- "source": [
- "It is of course common to scale before running PCA."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "id": "bbe9e004",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "intercept 5.352159\n",
- "pca(myvars, , n_components=2)[0] 0.446383\n",
- "pca(myvars, , n_components=2)[1] -1.219788\n",
- "ShelveLoc[Good] 4.922780\n",
- "ShelveLoc[Medium] 2.005617\n",
- "dtype: float64"
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pca(['Income', \n",
- " 'Price', \n",
- " 'Advertising', \n",
- " 'Population'], \n",
- " n_components=2, \n",
- " name='myvars',\n",
- " scale=True), 'ShelveLoc'])\n",
- "X = design.fit_transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "id": "d78c02e4",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ prcomp(cbind(Income, Price, Advertising, \n",
- " Population), scale = TRUE)$x[, 1:2] + ShelveLoc, data = Carseats)\n",
- "\n",
- "Coefficients:\n",
- " (Intercept) \n",
- " 5.3522 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC1 \n",
- " 0.4469 \n",
- "prcomp(cbind(Income, Price, Advertising, Population), scale = TRUE)$x[, 1:2]PC2 \n",
- " -1.2213 \n",
- " ShelveLocGood \n",
- " 4.9228 \n",
- " ShelveLocMedium \n",
- " 2.0056 \n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "lm(Sales ~ prcomp(cbind(Income, Price, Advertising, Population), scale=TRUE)$x[,1:2] + ShelveLoc, data=Carseats)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8a03c603",
- "metadata": {},
- "source": [
- "There will be some small differences in the coefficients due to `sklearn` use of `np.std(ddof=0)` instead\n",
- "of `np.std(ddof=1)`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "id": "f8215cef",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([ 0.44694166, -1.22131519])"
- ]
- },
- "execution_count": 57,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "np.array(sm.OLS(Y, X).fit().params)[1:3] * np.sqrt(X.shape[0] / (X.shape[0]-1))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a15d0ead",
- "metadata": {},
- "source": [
- "## Submodels\n",
- "\n",
- "We can build submodels as well, even if the terms do not appear in the original `terms` argument.\n",
- "Fundamentally, the terms just need to be able to have the `design.build_columns` work for us to be\n",
- "able to build a design matrix. The initial inspection of the columns of `Carseats` has created\n",
- "a column for `US`, hence we can build this submodel."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "id": "d58c6244",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " intercept | \n",
- " US[Yes] | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 395 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 396 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 397 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 398 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- " | 399 | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
400 rows × 2 columns
\n",
- "
"
- ],
- "text/plain": [
- " intercept US[Yes]\n",
- "0 1.0 1.0\n",
- "1 1.0 1.0\n",
- "2 1.0 1.0\n",
- "3 1.0 1.0\n",
- "4 1.0 0.0\n",
- ".. ... ...\n",
- "395 1.0 1.0\n",
- "396 1.0 1.0\n",
- "397 1.0 1.0\n",
- "398 1.0 1.0\n",
- "399 1.0 1.0\n",
- "\n",
- "[400 rows x 2 columns]"
- ]
- },
- "execution_count": 58,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec(['UIncome', 'ShelveLoc', 'Price']).fit(Carseats)\n",
- "design.build_submodel(Carseats, ['US'])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9365ba27",
- "metadata": {},
- "source": [
- "## ANOVA \n",
- "\n",
- "For a given `terms` argument, there is a natural sequence of models, namely those specified by `[terms[:i] for i in range(len(terms)+1)]`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "id": "332ab454",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['intercept'], dtype='object')\n",
- "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]'], dtype='object')\n",
- "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price'], dtype='object')\n",
- "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price',\n",
- " 'UIncome[L]', 'UIncome[M]'],\n",
- " dtype='object')\n",
- "Index(['intercept', 'ShelveLoc[Good]', 'ShelveLoc[Medium]', 'Price',\n",
- " 'UIncome[L]', 'UIncome[M]', 'US[Yes]'],\n",
- " dtype='object')\n"
- ]
- }
- ],
- "source": [
- "design = ModelSpec(['ShelveLoc', 'Price', 'UIncome', 'US']).fit(Carseats)\n",
- "for D in design.build_sequence(Carseats):\n",
- " print(D.columns)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "id": "f6cfd031",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 397.0 | \n",
- " 2172.743555 | \n",
- " 2.0 | \n",
- " 1009.531143 | \n",
- " 153.010858 | \n",
- " 5.452815e-50 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 396.0 | \n",
- " 1455.640702 | \n",
- " 1.0 | \n",
- " 717.102853 | \n",
- " 217.377192 | \n",
- " 1.583751e-39 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 394.0 | \n",
- " 1378.915938 | \n",
- " 2.0 | \n",
- " 76.724764 | \n",
- " 11.628885 | \n",
- " 1.239031e-05 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 393.0 | \n",
- " 1296.462700 | \n",
- " 1.0 | \n",
- " 82.453238 | \n",
- " 24.994257 | \n",
- " 8.678832e-07 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 397.0 2172.743555 2.0 1009.531143 153.010858 5.452815e-50\n",
- "2 396.0 1455.640702 1.0 717.102853 217.377192 1.583751e-39\n",
- "3 394.0 1378.915938 2.0 76.724764 11.628885 1.239031e-05\n",
- "4 393.0 1296.462700 1.0 82.453238 24.994257 8.678832e-07"
- ]
- },
- "execution_count": 60,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "id": "11c4aee8",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Response: Sales\n",
- " Df Sum Sq Mean Sq F value Pr(>F) \n",
- "ShelveLoc 2 1009.53 504.77 153.011 < 2.2e-16 ***\n",
- "Price 1 717.10 717.10 217.377 < 2.2e-16 ***\n",
- "UIncome 2 76.72 38.36 11.629 1.240e-05 ***\n",
- "US 1 82.45 82.45 24.994 8.679e-07 ***\n",
- "Residuals 393 1296.46 3.30 \n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "anova(lm(Sales ~ ShelveLoc + Price + UIncome + US, data=Carseats))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9a4e6e63",
- "metadata": {},
- "source": [
- "Recall that `ModelSpec` does not inspect `terms` to reorder based on degree of \n",
- "interaction as `R` does:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "id": "6e7bf361",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 393.0 | \n",
- " 2059.376413 | \n",
- " 6.0 | \n",
- " 1122.898284 | \n",
- " 35.940047 | \n",
- " 1.175738e-34 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 391.0 | \n",
- " 2036.044596 | \n",
- " 2.0 | \n",
- " 23.331817 | \n",
- " 2.240310 | \n",
- " 1.077900e-01 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 393.0 2059.376413 6.0 1122.898284 35.940047 1.175738e-34\n",
- "2 391.0 2036.044596 2.0 23.331817 2.240310 1.077900e-01"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([(full_encoding, 'ShelveLoc'), pref_encoding]).fit(Carseats)\n",
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats) ))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "id": "ed7d4bfa",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Response: Sales\n",
- " Df Sum Sq Mean Sq F value Pr(>F) \n",
- "UIncome 2 61.92 30.962 5.9458 0.002859 ** \n",
- "UIncome:ShelveLoc 6 1084.31 180.718 34.7049 < 2.2e-16 ***\n",
- "Residuals 391 2036.04 5.207 \n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "anova(lm(Sales ~ UIncome:ShelveLoc + UIncome, data=Carseats))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0350da34",
- "metadata": {},
- "source": [
- "To agree with `R` we must order `terms` as `R` will."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "id": "5ddaf87c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 397.0 | \n",
- " 3120.351382 | \n",
- " 2.0 | \n",
- " 61.923316 | \n",
- " 5.945846 | \n",
- " 2.855424e-03 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 391.0 | \n",
- " 2036.044596 | \n",
- " 6.0 | \n",
- " 1084.306785 | \n",
- " 34.704868 | \n",
- " 1.346561e-33 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 397.0 3120.351382 2.0 61.923316 5.945846 2.855424e-03\n",
- "2 391.0 2036.044596 6.0 1084.306785 34.704868 1.346561e-33"
- ]
- },
- "execution_count": 64,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design = ModelSpec([pref_encoding, (full_encoding, 'ShelveLoc')]).fit(Carseats)\n",
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1ef70ce3",
- "metadata": {},
- "source": [
- "## More complicated interactions\n",
- "\n",
- "Can we have an interaction of a polynomial effect with a categorical? Absolutely"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
- "id": "a1a14742",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Response: Sales\n",
- " Df Sum Sq Mean Sq F value Pr(>F) \n",
- "UIncome 2 61.92 30.9617 4.0310 0.01851 *\n",
- "UIncome:poly(Income, 3) 9 79.72 8.8581 1.1533 0.32408 \n",
- "UIncome:US 3 83.51 27.8367 3.6242 0.01324 *\n",
- "Residuals 385 2957.12 7.6808 \n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "anova(lm(Sales ~ UIncome + poly(Income, 3):UIncome + UIncome:US, data=Carseats))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a909be1a",
- "metadata": {},
- "source": [
- "To match `R` we note that it has used its inspection rules to encode `UIncome` with 3 levels\n",
- "for the two interactions."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "id": "ae286cf3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "intercept 65.978856\n",
- "UIncome[M] -60.159607\n",
- "UIncome[H] -147.276154\n",
- "poly(Income, 3, )[0]:UIncome[H] 1957.694387\n",
- "poly(Income, 3, )[0]:UIncome[L] 1462.060650\n",
- "poly(Income, 3, )[0]:UIncome[M] 83.035153\n",
- "poly(Income, 3, )[1]:UIncome[H] -984.494570\n",
- "poly(Income, 3, )[1]:UIncome[L] 881.537647\n",
- "poly(Income, 3, )[1]:UIncome[M] -18.006234\n",
- "poly(Income, 3, )[2]:UIncome[H] 207.614692\n",
- "poly(Income, 3, )[2]:UIncome[L] 217.190749\n",
- "poly(Income, 3, )[2]:UIncome[M] 34.065434\n",
- "UIncome[H]:US 0.903404\n",
- "UIncome[L]:US 0.895538\n",
- "UIncome[M]:US 1.048728\n",
- "dtype: float64"
- ]
- },
- "execution_count": 66,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "p3 = poly('Income', 3)\n",
- "design = ModelSpec([pref_encoding, (p3, full_encoding), (full_encoding, 'US')]).fit(Carseats)\n",
- "X = design.transform(Carseats)\n",
- "sm.OLS(Y, X).fit().params"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "id": "236ab2d2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 397.0 | \n",
- " 3120.351382 | \n",
- " 2.0 | \n",
- " 61.923316 | \n",
- " 4.031032 | \n",
- " 0.018488 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 388.0 | \n",
- " 3040.628559 | \n",
- " 9.0 | \n",
- " 79.722823 | \n",
- " 1.153273 | \n",
- " 0.324049 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 385.0 | \n",
- " 2957.118444 | \n",
- " 3.0 | \n",
- " 83.510115 | \n",
- " 3.624181 | \n",
- " 0.013244 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 397.0 3120.351382 2.0 61.923316 4.031032 0.018488\n",
- "2 388.0 3040.628559 9.0 79.722823 1.153273 0.324049\n",
- "3 385.0 2957.118444 3.0 83.510115 3.624181 0.013244"
- ]
- },
- "execution_count": 67,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0a45c720",
- "metadata": {},
- "source": [
- "## Grouping columns for ANOVA\n",
- "\n",
- "The `Variable` construct can be used to group\n",
- "variables together to get custom sequences of models for `anova_lm`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "id": "f36c1b3b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['intercept'], dtype='object')\n",
- "Index(['intercept', 'Price', 'UIncome[M]', 'UIncome[H]'], dtype='object')\n",
- "Index(['intercept', 'Price', 'UIncome[M]', 'UIncome[H]', 'US[Yes]',\n",
- " 'Advertising'],\n",
- " dtype='object')\n"
- ]
- }
- ],
- "source": [
- "group1 = Variable(('Price', pref_encoding), 'group1', None)\n",
- "group2 = Variable(('US', 'Advertising'), 'group2', None)\n",
- "design = ModelSpec([group1, group2]).fit(Carseats)\n",
- "for D in design.build_sequence(Carseats):\n",
- " print(D.columns)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "id": "3daf7638",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " df_resid | \n",
- " ssr | \n",
- " df_diff | \n",
- " ss_diff | \n",
- " F | \n",
- " Pr(>F) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 399.0 | \n",
- " 3182.274698 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 396.0 | \n",
- " 2508.187788 | \n",
- " 3.0 | \n",
- " 674.086910 | \n",
- " 39.304841 | \n",
- " 2.970412e-22 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 394.0 | \n",
- " 2252.396343 | \n",
- " 2.0 | \n",
- " 255.791445 | \n",
- " 22.372135 | \n",
- " 6.267562e-10 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 399.0 3182.274698 0.0 NaN NaN NaN\n",
- "1 396.0 2508.187788 3.0 674.086910 39.304841 2.970412e-22\n",
- "2 394.0 2252.396343 2.0 255.791445 22.372135 6.267562e-10"
- ]
- },
- "execution_count": 69,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sm.stats.anova_lm(*(sm.OLS(Y, D).fit() for D in design.build_sequence(Carseats)))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "46c1ace8",
- "metadata": {},
- "source": [
- "It is not clear this is simple to do in `R` as the formula object expands all parentheses."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "id": "0b87e430",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Response: Sales\n",
- " Df Sum Sq Mean Sq F value Pr(>F) \n",
- "Price 1 630.03 630.03 110.2079 < 2.2e-16 ***\n",
- "UIncome 2 44.06 22.03 3.8533 0.02201 * \n",
- "US 1 121.88 121.88 21.3196 5.270e-06 ***\n",
- "Advertising 1 133.91 133.91 23.4247 1.868e-06 ***\n",
- "Residuals 394 2252.40 5.72 \n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "anova(lm(Sales ~ (Price + UIncome) + (US + Advertising), data=Carseats))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7c137360",
- "metadata": {},
- "source": [
- "It can be done by building up the models\n",
- "by hand; it is likely possible to do this programmatically, but it is not obvious."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "id": "b678d323",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Model 1: Sales ~ 1\n",
- "Model 2: Sales ~ Price + UIncome\n",
- "Model 3: Sales ~ Price + UIncome + US + Advertising\n",
- " Res.Df RSS Df Sum of Sq F Pr(>F) \n",
- "1 399 3182.3 \n",
- "2 396 2508.2 3 674.09 39.305 < 2.2e-16 ***\n",
- "3 394 2252.4 2 255.79 22.372 6.268e-10 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "M1 = lm(Sales ~ 1, data=Carseats)\n",
- "M2 = lm(Sales ~ Price + UIncome, data=Carseats)\n",
- "M3 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n",
- "anova(M1, M2, M3)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b0388949",
- "metadata": {},
- "source": [
- "## Alternative anova\n",
- "\n",
- "Another common ANOVA table involves dropping each term in succession from the model and comparing\n",
- "to the full model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 72,
- "id": "ac5b916a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'intercept'}\n",
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 395.0 4417.273517 0.0 NaN NaN NaN\n",
- "1 394.0 2252.396343 1.0 2164.877175 378.690726 1.359177e-59\n",
- "{'Price', 'UIncome[H]', 'UIncome[M]'}\n",
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 397.0 2950.808154 0.0 NaN NaN NaN\n",
- "1 394.0 2252.396343 3.0 698.411811 40.723184 6.077848e-23\n",
- "{'US[Yes]', 'Advertising'}\n",
- " df_resid ssr df_diff ss_diff F Pr(>F)\n",
- "0 396.0 2508.187788 0.0 NaN NaN NaN\n",
- "1 394.0 2252.396343 2.0 255.791445 22.372135 6.267562e-10\n"
- ]
- }
- ],
- "source": [
- "Dfull = design.transform(Carseats)\n",
- "Mfull = sm.OLS(Y, Dfull).fit()\n",
- "for i, D in enumerate(design.build_sequence(Carseats, anova_type='drop')):\n",
- " if i == 0:\n",
- " D0 = D\n",
- " print(set(D.columns) ^ set(Dfull.columns))\n",
- " print(sm.stats.anova_lm(sm.OLS(Y, D).fit(), Mfull))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 73,
- "id": "a0c71948",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Model 1: Sales ~ US + Advertising\n",
- "Model 2: Sales ~ Price + UIncome + US + Advertising\n",
- " Res.Df RSS Df Sum of Sq F Pr(>F) \n",
- "1 397 2950.8 \n",
- "2 394 2252.4 3 698.41 40.723 < 2.2e-16 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
- "Analysis of Variance Table\n",
- "\n",
- "Model 1: Sales ~ Price + UIncome\n",
- "Model 2: Sales ~ Price + UIncome + US + Advertising\n",
- " Res.Df RSS Df Sum of Sq F Pr(>F) \n",
- "1 396 2508.2 \n",
- "2 394 2252.4 2 255.79 22.372 6.268e-10 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n",
- "M2 = lm(Sales ~ US + Advertising, data=Carseats)\n",
- "print(anova(M2, M1))\n",
- "M3 = lm(Sales ~ Price + UIncome, data=Carseats)\n",
- "print(anova(M3, M1))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a5e4880d",
- "metadata": {},
- "source": [
- "The comparison without the intercept here is actually very hard to achieve in `R` with `anova` due to its inspection\n",
- "of the formula."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 74,
- "id": "4b383401",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analysis of Variance Table\n",
- "\n",
- "Model 1: Sales ~ Price + UIncome + US + Advertising - 1\n",
- "Model 2: Sales ~ Price + UIncome + US + Advertising\n",
- " Res.Df RSS Df Sum of Sq F Pr(>F)\n",
- "1 394 2252.4 \n",
- "2 394 2252.4 0 9.0949e-13 \n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "M1 = lm(Sales ~ Price + UIncome + US + Advertising, data=Carseats)\n",
- "M4 = lm(Sales ~ Price + UIncome + US + Advertising - 1, data=Carseats)\n",
- "print(anova(M4, M1))"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "72d7c83b",
- "metadata": {},
- "source": [
- "It can be found with `summary`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "id": "4d5ce789",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Call:\n",
- "lm(formula = Sales ~ Price + UIncome + US + Advertising, data = Carseats)\n",
- "\n",
- "Residuals:\n",
- " Min 1Q Median 3Q Max \n",
- "-7.4437 -1.6351 -0.0932 1.4920 6.8076 \n",
- "\n",
- "Coefficients:\n",
- " Estimate Std. Error t value Pr(>|t|) \n",
- "(Intercept) 12.520356 0.643390 19.460 < 2e-16 ***\n",
- "Price -0.054000 0.005072 -10.647 < 2e-16 ***\n",
- "UIncomeM 0.548906 0.281693 1.949 0.0521 . \n",
- "UIncomeH 0.708219 0.322028 2.199 0.0284 * \n",
- "USYes 0.024181 0.343246 0.070 0.9439 \n",
- "Advertising 0.119509 0.024692 4.840 1.87e-06 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
- "\n",
- "Residual standard error: 2.391 on 394 degrees of freedom\n",
- "Multiple R-squared: 0.2922,\tAdjusted R-squared: 0.2832 \n",
- "F-statistic: 32.53 on 5 and 394 DF, p-value: < 2.2e-16\n",
- "\n"
- ]
- }
- ],
- "source": [
- "%%R\n",
- "summary(M1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "id": "56b82d02",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(378.690726, 378.69160000000005)"
- ]
- },
- "execution_count": 76,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "378.690726, 19.46**2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "872f645c-1d6f-4d08-9eec-2b80276bc82c",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "jupytext": {
- "formats": "source/models///ipynb,jupyterbook/models///md:myst,jupyterbook/models///ipynb"
- },
- "kernelspec": {
- "display_name": "python3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/source/transforms/PCA.ipynb b/docs/source/transforms/PCA.ipynb
index 224992b..ec1e0ae 100644
--- a/docs/source/transforms/PCA.ipynb
+++ b/docs/source/transforms/PCA.ipynb
@@ -19,9 +19,14 @@
"outputs": [],
"source": [
"import numpy as np\n",
+ "from sklearn.decomposition import PCA\n",
+ "\n",
"from ISLP import load_data\n",
- "from ISLP.models import ModelSpec, pca, Variable, derived_variable\n",
- "from sklearn.decomposition import PCA"
+ "from ISLP.models import (ModelSpec, \n",
+ " pca, \n",
+ " Feature, \n",
+ " derived_feature,\n",
+ " build_columns)"
]
},
{
@@ -71,7 +76,7 @@
"id": "fff603bf",
"metadata": {},
"source": [
- "Suppose we want to make a `Variable` representing the first 3 principal components of the\n",
+ "Suppose we want to make a `Feature` representing the first 3 principal components of the\n",
" features `['CompPrice', 'Income', 'Advertising', 'Population', 'Price']`."
]
},
@@ -80,8 +85,8 @@
"id": "eab49ad1-3957-478f-8a76-28a8f58551e9",
"metadata": {},
"source": [
- "We first make a `Variable` that represents these five features columns, then `pca`\n",
- "can be used to compute a new `Variable` that returns the first three principal components."
+ "We first make a `Feature` that represents these five features columns, then `pca`\n",
+ "can be used to compute a new `Feature` that returns the first three principal components."
]
},
{
@@ -91,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
- "grouped = Variable(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)\n",
+ "grouped = Feature(('CompPrice', 'Income', 'Advertising', 'Population', 'Price'), name='grouped', encoder=None)\n",
"sklearn_pca = PCA(n_components=3, whiten=True)"
]
},
@@ -100,7 +105,7 @@
"id": "b45655a3-393d-4b4c-b754-cda61ed0e014",
"metadata": {},
"source": [
- "We can now fit `sklearn_pca` and create our new variable."
+ "We can now fit `sklearn_pca` and create our new feature."
]
},
{
@@ -108,175 +113,18 @@
"execution_count": 5,
"id": "6cfe8861-ad07-47b9-95d1-5d5513ff6fbe",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "sklearn_pca.fit(design.build_columns(Carseats, grouped)[0]) \n",
- "pca_var = derived_variable(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],\n",
+ "grouped_features = build_columns(design.column_info_,\n",
+ " Carseats,\n",
+ " grouped)[0]\n",
+ "sklearn_pca.fit(grouped_features) \n",
+ "pca_var = derived_feature(['CompPrice', 'Income', 'Advertising', 'Population', 'Price'],\n",
" name='pca(grouped)', encoder=sklearn_pca)\n",
- "derived_features, _ = design.build_columns(Carseats, pca_var)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "aeb47184-9e15-4a6e-b60a-916f5ff89063",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " CompPrice | \n",
- " Income | \n",
- " Advertising | \n",
- " Population | \n",
- " Price | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 138 | \n",
- " 73 | \n",
- " 11 | \n",
- " 276 | \n",
- " 120 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 111 | \n",
- " 48 | \n",
- " 16 | \n",
- " 260 | \n",
- " 83 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 113 | \n",
- " 35 | \n",
- " 10 | \n",
- " 269 | \n",
- " 80 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 117 | \n",
- " 100 | \n",
- " 4 | \n",
- " 466 | \n",
- " 97 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 141 | \n",
- " 64 | \n",
- " 3 | \n",
- " 340 | \n",
- " 128 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 395 | \n",
- " 138 | \n",
- " 108 | \n",
- " 17 | \n",
- " 203 | \n",
- " 128 | \n",
- "
\n",
- " \n",
- " | 396 | \n",
- " 139 | \n",
- " 23 | \n",
- " 3 | \n",
- " 37 | \n",
- " 120 | \n",
- "
\n",
- " \n",
- " | 397 | \n",
- " 162 | \n",
- " 26 | \n",
- " 12 | \n",
- " 368 | \n",
- " 159 | \n",
- "
\n",
- " \n",
- " | 398 | \n",
- " 100 | \n",
- " 79 | \n",
- " 7 | \n",
- " 284 | \n",
- " 95 | \n",
- "
\n",
- " \n",
- " | 399 | \n",
- " 134 | \n",
- " 37 | \n",
- " 0 | \n",
- " 27 | \n",
- " 120 | \n",
- "
\n",
- " \n",
- "
\n",
- "
400 rows × 5 columns
\n",
- "
"
- ],
- "text/plain": [
- " CompPrice Income Advertising Population Price\n",
- "0 138 73 11 276 120\n",
- "1 111 48 16 260 83\n",
- "2 113 35 10 269 80\n",
- "3 117 100 4 466 97\n",
- "4 141 64 3 340 128\n",
- ".. ... ... ... ... ...\n",
- "395 138 108 17 203 128\n",
- "396 139 23 3 37 120\n",
- "397 162 26 12 368 159\n",
- "398 100 79 7 284 95\n",
- "399 134 37 0 27 120\n",
- "\n",
- "[400 rows x 5 columns]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "design.build_columns(Carseats, grouped)[0]"
+ "derived_features, _ = build_columns(design.column_info_,\n",
+ " Carseats, \n",
+ " pca_var,\n",
+ " encoders=design.encoders_)"
]
},
{
@@ -291,7 +139,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"id": "9f4b0955",
"metadata": {},
"outputs": [],
@@ -304,22 +152,10 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"id": "6b382699-eb86-457f-8e91-09a63eb21d49",
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n",
- "/Users/jonathantaylor/miniconda3/envs/islp_test/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names\n",
- " warnings.warn(\n"
- ]
- },
{
"data": {
"text/plain": [
@@ -329,7 +165,7 @@
" dtype='object')"
]
},
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -350,7 +186,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"id": "4a8d9b28",
"metadata": {},
"outputs": [],
@@ -361,7 +197,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 13,
"id": "6efa6c67-86e1-4f51-86c2-25c838a90bf4",
"metadata": {},
"outputs": [
@@ -371,7 +207,7 @@
"(4.073428490498941e-14, 0.0)"
]
},
- "execution_count": 10,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/docs/source/transforms/poly.ipynb b/docs/source/transforms/poly.ipynb
index c2b740b..45c862e 100644
--- a/docs/source/transforms/poly.ipynb
+++ b/docs/source/transforms/poly.ipynb
@@ -168,7 +168,7 @@
"source": [
"## Underlying model\n",
"\n",
- "If we look at `quartic`, we see it is a `Variable`, i.e. it can be used to produce a set of columns\n",
+ "If we look at `quartic`, we see it is a `Feature`, i.e. it can be used to produce a set of columns\n",
"in a design matrix when it is a term used in creating the `ModelSpec`.\n",
"\n",
"Its encoder is `Poly(degree=4)`. This is a special `sklearn` transform that expects a single column\n",
diff --git a/environment.yml b/environment.yml
deleted file mode 100644
index 3b4fd24..0000000
--- a/environment.yml
+++ /dev/null
@@ -1,240 +0,0 @@
-name: islp_test
-channels:
- - defaults
-dependencies:
- - ca-certificates=2022.07.19=hca03da5_0
- - certifi=2022.9.14=py39hca03da5_0
- - libcxx=14.0.6=h848a8c0_0
- - libffi=3.4.2=hc377ac9_4
- - ncurses=6.3=h1a28f6b_3
- - openssl=1.1.1q=h1a28f6b_0
- - python=3.9.13=hbdb9e5c_1
- - readline=8.1.2=h1a28f6b_1
- - sqlite=3.39.2=h1058600_0
- - tk=8.6.12=hb8d0fd4_0
- - wheel=0.37.1=pyhd3eb1b0_0
- - xz=5.2.5=h1a28f6b_1
- - zlib=1.2.12=h5a0b063_3
- - pip:
- - absl-py==1.2.0
- - aiohttp==3.8.1
- - aiosignal==1.2.0
- - alabaster==0.7.12
- - ansiwrap==0.8.4
- - anyio==3.6.1
- - appnope==0.1.3
- - argon2-cffi==21.3.0
- - argon2-cffi-bindings==21.2.0
- - astor==0.8.1
- - asttokens==2.0.8
- - astunparse==1.6.3
- - async-timeout==4.0.2
- - attrs==22.1.0
- - autograd==1.4
- - autograd-gamma==0.5.0
- - babel==2.10.3
- - backcall==0.2.0
- - beautifulsoup4==4.11.1
- - bleach==5.0.1
- - build==0.8.0
- - cachetools==4.2.4
- - cffi==1.15.1
- - charset-normalizer==2.1.1
- - click==8.1.3
- - commonmark==0.9.1
- - contourpy==1.0.5
- - cycler==0.11.0
- - debugpy==1.6.3
- - decorator==5.1.1
- - defusedxml==0.7.1
- - docutils==0.17.1
- - entrypoints==0.4
- - exceptiongroup==1.1.0
- - executing==1.0.0
- - fastjsonschema==2.16.2
- - flatbuffers==2.0.7
- - fonttools==4.37.2
- - formulaic==0.5.2
- - frozenlist==1.3.1
- - fsspec==2022.8.2
- - future==0.18.2
- - gast==0.4.0
- - google-auth==1.35.0
- - google-auth-oauthlib==0.4.6
- - google-pasta==0.2.0
- - grpcio==1.48.1
- - h5py==3.7.0
- - html2text==2020.1.16
- - idna==3.4
- - imagesize==1.4.1
- - importlib-metadata==4.12.0
- - iniconfig==2.0.0
- - interface-meta==1.3.0
- - ipykernel==6.15.3
- - ipython==8.5.0
- - ipython-genutils==0.2.0
- - ipywidgets==8.0.2
- - jaraco-classes==3.2.2
- - jedi==0.18.1
- - jinja2==3.1.2
- - joblib==1.2.0
- - json5==0.9.10
- - jsonschema==4.16.0
- - jupyter==1.0.0
- - jupyter-cache==0.5.0
- - jupyter-client==7.3.5
- - jupyter-console==6.4.4
- - jupyter-core==4.11.1
- - jupyter-server==1.18.1
- - jupyterlab==3.4.7
- - jupyterlab-pygments==0.2.2
- - jupyterlab-server==2.15.1
- - jupyterlab-widgets==3.0.3
- - jupytext==1.14.5
- - keras==2.10.0
- - keras-preprocessing==1.1.2
- - keyring==23.9.3
- - kiwisolver==1.4.4
- - l0bnb==1.0.0
- - libclang==14.0.6
- - lifelines==0.27.2
- - llvmlite==0.39.1
- - lxml==4.9.1
- - markdown==3.4.1
- - markdown-it-py==2.1.0
- - markupsafe==2.1.1
- - matplotlib==3.6.0
- - matplotlib-inline==0.1.6
- - mdit-py-plugins==0.3.0
- - mdurl==0.1.2
- - mistune==2.0.4
- - more-itertools==8.14.0
- - multidict==6.0.2
- - myst==1.0.4
- - myst-nb==0.16.0
- - myst-parser==0.18.0
- - nbclassic==0.4.3
- - nbclient==0.5.13
- - nbconvert==7.0.0
- - nbformat==5.5.0
- - nbsphinx==0.8.11
- - nest-asyncio==1.5.5
- - notebook==6.4.12
- - notebook-shim==0.1.0
- - numba==0.56.2
- - numpy==1.23.3
- - numpydoc==1.4.0
- - oauthlib==3.2.1
- - opt-einsum==3.3.0
- - packaging==21.3
- - pandas==1.5.0
- - pandocfilters==1.5.0
- - papermill==2.4.0
- - parso==0.8.3
- - patsy==0.5.2
- - pep517==0.13.0
- - pexpect==4.8.0
- - pickleshare==0.7.5
- - pillow==9.2.0
- - pip==22.2.2
- - pkginfo==1.8.3
- - pluggy==1.0.0
- - portalocker==2.5.1
- - progressbar2==4.0.0
- - prometheus-client==0.14.1
- - prompt-toolkit==3.0.31
- - protobuf==3.19.5
- - psutil==5.9.2
- - ptyprocess==0.7.0
- - pure-eval==0.2.2
- - pyasn1==0.4.8
- - pyasn1-modules==0.2.8
- - pycparser==2.21
- - pydash==5.1.0
- - pydeprecate==0.3.2
- - pygam==0.8.0
- - pygments==2.13.0
- - pyparsing==3.0.9
- - pyrsistent==0.18.1
- - pytest==7.2.0
- - python-dateutil==2.8.2
- - python-utils==3.3.3
- - pytorch-lightning==1.7.6
- - pytz==2022.2.1
- - pytz-deprecation-shim==0.1.0.post0
- - pyyaml==6.0
- - pyzmq==24.0.0
- - qtconsole==5.3.2
- - qtpy==2.2.0
- - readme-renderer==37.1
- - requests==2.28.1
- - requests-oauthlib==1.3.1
- - requests-toolbelt==0.9.1
- - rfc3986==2.0.0
- - rich==12.5.1
- - rpy2==3.5.7
- - rsa==4.9
- - scikit-learn==1.1.2
- - scipy==1.9.1
- - send2trash==1.8.0
- - setuptools==59.8.0
- - six==1.16.0
- - sniffio==1.3.0
- - snowballstemmer==2.2.0
- - soupsieve==2.3.2.post1
- - sphinx==5.1.1
- - sphinx-markdown-builder==0.5.4
- - sphinx-rst-builder==0.0.3
- - sphinx-rtd-theme==1.1.1
- - sphinx-togglebutton==0.3.2
- - sphinxcontrib-applehelp==1.0.2
- - sphinxcontrib-devhelp==1.0.2
- - sphinxcontrib-htmlhelp==2.0.0
- - sphinxcontrib-jsmath==1.0.1
- - sphinxcontrib-qthelp==1.0.3
- - sphinxcontrib-serializinghtml==1.1.5
- - sqlalchemy==1.4.41
- - stack-data==0.5.0
- - statsmodels==0.13.2
- - tabulate==0.8.10
- - tenacity==6.3.1
- - tensorboard==2.10.0
- - tensorboard-data-server==0.6.1
- - tensorboard-plugin-wit==1.8.1
- - tensorflow-estimator==2.10.0
- - tensorflow-macos==2.10.0
- - tensorflow-metal==0.6.0
- - termcolor==2.0.1
- - terminado==0.15.0
- - texext==0.6.7
- - textwrap3==0.9.2
- - threadpoolctl==3.1.0
- - tinycss2==1.1.1
- - toml==0.10.2
- - tomli==2.0.1
- - torch==1.12.1
- - torchdata==0.4.1
- - torchinfo==1.7.0
- - torchmetrics==0.9.3
- - torchvision==0.13.1
- - tornado==6.2
- - tqdm==4.64.1
- - traitlets==5.4.0
- - twine==4.0.1
- - typing-extensions==4.3.0
- - tzdata==2022.7
- - tzlocal==4.2
- - unidecode==1.3.4
- - unify==0.5
- - untokenize==0.1.1
- - urllib3==1.26.12
- - wcwidth==0.2.5
- - webencodings==0.5.1
- - websocket-client==1.4.1
- - werkzeug==2.2.2
- - widgetsnbextension==4.0.3
- - wrapt==1.14.1
- - yapf==0.32.0
- - yarl==1.8.1
- - zipp==3.8.1
-prefix: /Users/jonathantaylor/miniconda3/envs/islp_test
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..5fe63fd
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,88 @@
+[project]
+name = "ISLP"
+dependencies = ["numpy>=1.7.1",
+ "scipy>=0.9",
+ "pandas>=1.5",
+ "lxml", # pandas needs this for html
+ "scikit-learn>=1.2",
+ "joblib",
+ "statsmodels>=0.13",
+ "lifelines",
+ "pygam", # for GAM in Ch7
+ "torch",
+ "pytorch_lightning",
+ "torchmetrics",
+ ]
+description = "Library for ISLP labs"
+readme = "README.md"
+requires-python = ">=3.10"
+license = {file = "LICENSE"}
+keywords = []
+authors = [
+ {name = "Trevor Hastie", email="hastie@stanford.edu" },
+ {name = "Gareth James", email="gareth@emory.edu"},
+ {name = "Jonathan Taylor", email="jonathan.taylor@stanford.edu" },
+ {name = "Rob Tibshirani", email="tibs@stanford.edu" },
+ {name = "Daniela Witten", email="dwitten@uw.edu" },
+ ]
+maintainers = [
+ {name = "Jonathan Taylor", email="jonathan.taylor@stanford.edu" },
+ ]
+classifiers = ["Development Status :: 3 - Alpha",
+ "Environment :: Console",
+ "Intended Audience :: Science/Research",
+ "License :: OSI Approved :: BSD License",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python",
+ "Topic :: Scientific/Engineering"
+ ]
+dynamic = ["version"]
+
+[tool.setuptools]
+packages = [
+ "ISLP",
+ "ISLP.models",
+ "ISLP.bart",
+ "ISLP.torch",
+ "ISLP.data"
+]
+include-package-data = true
+
+[tool.setuptools.package-data]
+ISLP = ["data/*.csv", "data/*.npy", "data/*.data"]
+
+[tool.setuptools.dynamic]
+version = {attr = "ISLP.__version__"} # Assuming ISLP.__version__ holds your version
+
+
+[project.urls] # Optional
+"Homepage" = "https://github.com/intro-stat-learning/ISLP"
+"Bug Reports" = "https://github.com/intro-stat-learning/ISLP/issues"
+"Funding" = "https://donate.pypi.org"
+"Say Thanks!" = "http://saythanks.io/to/example"
+"Source" = "https://github.com/pypa/sampleproject/"
+
+[project.optional-dependencies]
+doc = ['Sphinx>=3.0']
+
+[build-system]
+requires = ["setuptools>=42",
+ "wheel",
+ "Sphinx>=1.0",
+ "numpy",
+ "pandas",
+ "scipy",
+ "scikit-learn",
+ "joblib",
+ "statsmodels",
+ "versioneer[toml]"
+ ]
+build-backend = "setuptools.build_meta"
+
+[tool.versioneer]
+VCS = "git"
+style = "pep440"
+versionfile_source = "ISLP/_version.py"
+versionfile_build = "ISLP/_version.py"
+tag_prefix = "v"
+parentdir_prefix = "ISLP-"
diff --git a/requirements.txt b/requirements.txt
index bf393e1..10bff6a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,13 @@
numpy>=1.7.1
scipy>=0.9
-jupyter
pandas>=0.20
+pandas<=1.9
lxml # pandas needs this for html
-scikit-learn>=1.0
+scikit-learn>=1.2
joblib
statsmodels>=0.13
lifelines
-#l0bnb # for bestsubsets
-#pygam # for GAM in Ch7
+pygam # for GAM in Ch7
+torch
+pytorch_lightning
+torchmetrics
diff --git a/setup.cfg b/setup.cfg
index 14d7ccd..c59c035 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,3 @@
-[versioneer]
-VCS = git
-style = pep440
-versionfile_source = ISLP/_version.py
-tag_prefix =
-parentdir_prefix = ISLP-
+
+[metadata]
+license_files = LICENSE.txt
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100755
index 95fca7d..0000000
--- a/setup.py
+++ /dev/null
@@ -1,254 +0,0 @@
-#!/usr/bin/env python
-''' Installation script for ISLP package '''
-
-import os
-import sys
-from os.path import join as pjoin, dirname, exists
-from distutils.version import LooseVersion
-# BEFORE importing distutils, remove MANIFEST. distutils doesn't properly
-# update it when the contents of directories change.
-if exists('MANIFEST'): os.remove('MANIFEST')
-
-# Unconditionally require setuptools
-import setuptools
-
-# Package for getting versions from git tags
-import versioneer
-
-# Import distutils _after_ setuptools import, and after removing
-# MANIFEST
-from distutils.core import setup
-from distutils.extension import Extension
-
-# Get various parameters for this version, stored in ISLP/info.py
-
-class Bunch(object):
- def __init__(self, vars):
- for key, name in vars.items():
- if key.startswith('__'):
- continue
- self.__dict__[key] = name
-
-def read_vars_from(ver_file):
- """ Read variables from Python text file
-
- Parameters
- ----------
- ver_file : str
- Filename of file to read
-
- Returns
- -------
- info_vars : Bunch instance
- Bunch object where variables read from `ver_file` appear as
- attributes
- """
- # Use exec for compabibility with Python 3
- ns = {}
- with open(ver_file, 'rt') as fobj:
- exec(fobj.read(), ns)
- return Bunch(ns)
-
-info = read_vars_from(pjoin('ISLP', 'info.py'))
-
-class SetupDependency(object):
- """ SetupDependency class
-
- Parameters
- ----------
- import_name : str
- Name with which required package should be ``import``ed.
- min_ver : str
- Distutils version string giving minimum version for package.
- req_type : {'install_requires', 'setup_requires'}, optional
- Setuptools dependency type.
- heavy : {False, True}, optional
- If True, and package is already installed (importable), then do not add
- to the setuptools dependency lists. This prevents setuptools
- reinstalling big packages when the package was installed without using
- setuptools, or this is an upgrade, and we want to avoid the pip default
- behavior of upgrading all dependencies.
- install_name : str, optional
- Name identifying package to install from pypi etc, if different from
- `import_name`.
- """
-
- def __init__(self, import_name,
- min_ver,
- req_type='install_requires',
- heavy=False,
- install_name=None):
- self.import_name = import_name
- self.min_ver = min_ver
- self.req_type = req_type
- self.heavy = heavy
- self.install_name = (import_name if install_name is None
- else install_name)
-
- def check_fill(self, setuptools_kwargs):
- """ Process this dependency, maybe filling `setuptools_kwargs`
-
- Run checks on this dependency. If not using setuptools, then raise
- error for unmet dependencies. If using setuptools, add missing or
- not-heavy dependencies to `setuptools_kwargs`.
-
- A heavy dependency is one that is inconvenient to install
- automatically, such as numpy or (particularly) scipy, matplotlib.
-
- Parameters
- ----------
- setuptools_kwargs : dict
- Dictionary of setuptools keyword arguments that may be modified
- in-place while checking dependencies.
- """
- found_ver = get_pkg_version(self.import_name)
- ver_err_msg = version_error_msg(self.import_name,
- found_ver,
- self.min_ver)
- if not 'setuptools' in sys.modules:
- # Not using setuptools; raise error for any unmet dependencies
- if ver_err_msg is not None:
- raise RuntimeError(ver_err_msg)
- return
- # Using setuptools; add packages to given section of
- # setup/install_requires, unless it's a heavy dependency for which we
- # already have an acceptable importable version.
- if self.heavy and ver_err_msg is None:
- return
- new_req = '{0}>={1}'.format(self.import_name, self.min_ver)
- old_reqs = setuptools_kwargs.get(self.req_type, [])
- setuptools_kwargs[self.req_type] = old_reqs + [new_req]
-
-def get_pkg_version(pkg_name):
- """ Return package version for `pkg_name` if installed
-
- Returns
- -------
- pkg_version : str or None
- Return None if package not importable. Return 'unknown' if standard
- ``__version__`` string not present. Otherwise return version string.
- """
- try:
- pkg = __import__(pkg_name)
- except ImportError:
- return None
- try:
- return pkg.__version__
- except AttributeError:
- return 'unknown'
-
-def version_error_msg(pkg_name, found_ver, min_ver):
- """ Return informative error message for version or None
- """
- if found_ver is None:
- return 'We need package {0}, but not importable'.format(pkg_name)
- if found_ver == 'unknown':
- return 'We need {0} version {1}, but cannot get version'.format(
- pkg_name, min_ver)
- if LooseVersion(found_ver) >= LooseVersion(min_ver):
- return None
- return 'We need {0} version {1}, but found version {2}'.format(
- pkg_name, found_ver, min_ver)
-
-
-
-# Try to preempt setuptools monkeypatching of Extension handling when Pyrex
-# is missing. Otherwise the monkeypatched Extension will change .pyx
-# filenames to .c filenames, and we probably don't have the .c files.
-sys.path.insert(0, pjoin(dirname(__file__), 'fake_pyrex'))
-# Set setuptools extra arguments
-extra_setuptools_args = dict(
- tests_require=['nose'],
- test_suite='nose.collector',
- zip_safe=False,
- extras_require = dict(
- doc=['Sphinx>=1.0'],
- test=['nose>=0.10.1']))
-
-# Define extensions
-EXTS = []
-
-SetupDependency('numpy', info.NUMPY_MIN_VERSION,
- req_type='install_requires',
- heavy=True).check_fill(extra_setuptools_args)
-SetupDependency('scipy', info.SCIPY_MIN_VERSION,
- req_type='install_requires',
- heavy=True).check_fill(extra_setuptools_args)
-SetupDependency('matplotlib', info.MATPLOTLIB_MIN_VERSION,
- req_type='install_requires',
- heavy=True).check_fill(extra_setuptools_args)
-SetupDependency('pandas', info.PANDAS_MIN_VERSION,
- req_type='install_requires',
- heavy=True).check_fill(extra_setuptools_args)
-SetupDependency('statsmodels', info.STATSMODELS_MIN_VERSION,
- req_type='install_requires',
- heavy=True).check_fill(extra_setuptools_args)
-SetupDependency('scikit-learn', info.SKLEARN_MIN_VERSION,
- req_type='install_requires',
- heavy=True).check_fill(extra_setuptools_args)
-
-#requirements = open('requirements.txt').read().strip().split('\n')
-
-requirements = '''numpy
-scipy
-jupyter
-pandas
-lxml # pandas needs this for html
-scikit-learn
-joblib
-pygam # for GAM in Ch7
-lifelines'''.split('\n')
-#l0bnb # for bestsubsets
-
-
-
-for req in requirements:
- req = req.split('#')[0]
- import sys; sys.stderr.write(req+'\n')
- SetupDependency(req, "0.0",
- req_type='install_requires',
- heavy=True).check_fill(extra_setuptools_args)
-
-cmdclass=versioneer.get_cmdclass()
-
-# get long_description
-
-if sys.version_info[0] > 2:
- long_description = open('README.md', 'rt', encoding='utf-8').read()
-else:
- long_description = unicode(file('README.md').read(), 'utf-8')
-
-def main(**extra_args):
- setup(name=info.NAME,
- maintainer=info.MAINTAINER,
- maintainer_email=info.MAINTAINER_EMAIL,
- description=info.DESCRIPTION,
- url=info.URL,
- download_url=info.DOWNLOAD_URL,
- license=info.LICENSE,
- classifiers=info.CLASSIFIERS,
- author=info.AUTHOR,
- author_email=info.AUTHOR_EMAIL,
- platforms=info.PLATFORMS,
- version=versioneer.get_version(),
- requires=info.REQUIRES,
- provides=info.PROVIDES,
- packages = ['ISLP',
- 'ISLP.models',
- 'ISLP.bart',
- 'ISLP.torch'
- ],
- ext_modules = EXTS,
- package_data = {"ISLP":["data/*csv", "data/*npy", "data/*data"]},
- include_package_data=True,
- data_files=[],
- scripts=[],
- long_description=long_description,
- cmdclass = cmdclass,
- **extra_args
- )
-
-#simple way to test what setup will do
-#python setup.py install --prefix=/tmp
-if __name__ == "__main__":
- main(**extra_setuptools_args)
diff --git a/ISLP/bart/tests/test_bart.py b/tests/bart/test_bart.py
similarity index 96%
rename from ISLP/bart/tests/test_bart.py
rename to tests/bart/test_bart.py
index d12a0a2..903bb83 100644
--- a/ISLP/bart/tests/test_bart.py
+++ b/tests/bart/test_bart.py
@@ -19,8 +19,6 @@ def test_bart():
clone(B)
- return B
-
if __name__ == "__main__":
test_bart()
diff --git a/ISLP/models/tests/__init__.py b/tests/deeplearning/__init__.py
similarity index 100%
rename from ISLP/models/tests/__init__.py
rename to tests/deeplearning/__init__.py
diff --git a/tests/deeplearning/test_hitters.py b/tests/deeplearning/test_hitters.py
new file mode 100644
index 0000000..bf609b9
--- /dev/null
+++ b/tests/deeplearning/test_hitters.py
@@ -0,0 +1,481 @@
+import numpy as np
+import pandas as pd
+from matplotlib.pyplot import subplots
+from sklearn.linear_model import \
+ (LinearRegression,
+ Lasso)
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import KFold
+from sklearn.pipeline import Pipeline
+from ISLP import load_data
+from ISLP.models import ModelSpec as MS
+from sklearn.model_selection import \
+ (train_test_split,
+ GridSearchCV)
+
+# torch
+
+import torch
+from torch import nn
+from torch.utils.data import TensorDataset
+
+# torch helpers
+
+from torchmetrics import MeanAbsoluteError
+from torchinfo import summary
+
+# pytorch lightning
+
+from pytorch_lightning import Trainer
+from pytorch_lightning.loggers import CSVLogger
+
+# setting seed
+
+from pytorch_lightning import seed_everything
+seed_everything(0, workers=True)
+torch.use_deterministic_algorithms(True, warn_only=True)
+
+# ISLP.torch
+
+from ISLP.torch import (SimpleDataModule,
+ SimpleModule,
+ ErrorTracker,
+ rec_num_workers)
+
+
+def test_hitters(max_epochs=2,
+ num_lam=5):
+
+ Hitters = load_data('Hitters').dropna()
+ n = Hitters.shape[0]
+
+ # We will fit two linear models (least squares and lasso) and compare their performance
+ # to that of a neural network. For this comparison we will use mean absolute error on a validation dataset.
+ # \begin{equation*}
+ # \begin{split}
+ # \mbox{MAE}(y,\hat{y}) = \frac{1}{n} \sum_{i=1}^n |y_i-\hat{y}_i|.
+ # \end{split}
+ # \end{equation*}
+ # We set up the model matrix and the response.
+
+ # In[11]:
+
+
+ model = MS(Hitters.columns.drop('Salary'), intercept=False)
+ X = model.fit_transform(Hitters).to_numpy()
+ Y = Hitters['Salary'].to_numpy()
+
+
+ # The `to_numpy()` method above converts `pandas`
+ # data frames or series to `numpy` arrays.
+ # We do this because we will need to use `sklearn` to fit the lasso model,
+ # and it requires this conversion.
+ # We also use a linear regression method from `sklearn`, rather than the method
+ # in Chapter~3 from `statsmodels`, to facilitate the comparisons.
+
+ # We now split the data into test and training, fixing the random
+ # state used by `sklearn` to do the split.
+
+ # In[12]:
+
+
+ (X_train,
+ X_test,
+ Y_train,
+ Y_test) = train_test_split(X,
+ Y,
+ test_size=1/3,
+ random_state=1)
+
+
+ # ### Linear Models
+ # We fit the linear model and evaluate the test error directly.
+
+ # In[13]:
+
+
+ hit_lm = LinearRegression().fit(X_train, Y_train)
+ Yhat_test = hit_lm.predict(X_test)
+ np.abs(Yhat_test - Y_test).mean()
+
+
+ # Next we fit the lasso using `sklearn`. We are using
+ # mean absolute error to select and evaluate a model, rather than mean squared error.
+ # The specialized solver we used in Section 6.5.2 uses only mean squared error. So here, with a bit more work, we create a cross-validation grid and perform the cross-validation directly.
+ #
+ # We encode a pipeline with two steps: we first normalize the features using a `StandardScaler()` transform,
+ # and then fit the lasso without further normalization.
+
+ # In[14]:
+
+
+ scaler = StandardScaler(with_mean=True, with_std=True)
+ lasso = Lasso(warm_start=True, max_iter=30000)
+ standard_lasso = Pipeline(steps=[('scaler', scaler),
+ ('lasso', lasso)])
+
+
+ # We need to create a grid of values for $\lambda$. As is common practice,
+ # we choose a grid of 100 values of $\lambda$, uniform on the log scale from `lam_max` down to `0.01*lam_max`. Here `lam_max` is the smallest value of
+ # $\lambda$ with an all-zero solution. This value equals the largest absolute inner-product between any predictor and the (centered) response. {The derivation of this result is beyond the scope of this book.}
+
+ # In[15]:
+
+
+ X_s = scaler.fit_transform(X_train)
+ n = X_s.shape[0]
+ lam_max = np.fabs(X_s.T.dot(Y_train - Y_train.mean())).max() / n
+ param_grid = {'alpha': np.exp(np.linspace(0, np.log(0.01), num_lam))
+ * lam_max}
+
+
+ # Note that we had to transform the data first, since the scale of the variables impacts the choice of $\lambda$.
+ # We now perform cross-validation using this sequence of $\lambda$ values.
+
+ # In[16]:
+
+
+ cv = KFold(10,
+ shuffle=True,
+ random_state=1)
+ grid = GridSearchCV(lasso,
+ param_grid,
+ cv=cv,
+ scoring='neg_mean_absolute_error')
+ grid.fit(X_train, Y_train);
+
+
+ # We extract the lasso model with best cross-validated mean absolute error, and evaluate its
+ # performance on `X_test` and `Y_test`, which were not used in
+ # cross-validation.
+
+ # In[17]:
+
+
+ trained_lasso = grid.best_estimator_
+ Yhat_test = trained_lasso.predict(X_test)
+ np.fabs(Yhat_test - Y_test).mean()
+
+
+ # This is similar to the results we got for the linear model fit by least squares. However, these results can vary a lot for different train/test splits; we encourage the reader to try a different seed in code block 12 and rerun the subsequent code up to this point.
+ #
+ # ### Specifying a Network: Classes and Inheritance
+ # To fit the neural network, we first set up a model structure
+ # that describes the network.
+ # Doing so requires us to define new classes specific to the model we wish to fit.
+ # Typically this is done in `pytorch` by sub-classing a generic
+ # representation of a network, which is the approach we take here.
+ # Although this example is simple, we will go through the steps in some detail, since it will serve us well
+ # for the more complex examples to follow.
+
+ # In[18]:
+
+
+ class HittersModel(nn.Module):
+
+ def __init__(self, input_size):
+ super(HittersModel, self).__init__()
+ self.flatten = nn.Flatten()
+ self.sequential = nn.Sequential(
+ nn.Linear(input_size, 50),
+ nn.ReLU(),
+ nn.Dropout(0.4),
+ nn.Linear(50, 1))
+
+ def forward(self, x):
+ x = self.flatten(x)
+ return torch.flatten(self.sequential(x))
+
+
+ # The `class` statement identifies the code chunk as a
+ # declaration for a class `HittersModel`
+ # that inherits from the base class `nn.Module`. This base
+ # class is ubiquitous in `torch` and represents the
+ # mappings in the neural networks.
+ #
+ # Indented beneath the `class` statement are the methods of this class:
+ # in this case `__init__` and `forward`. The `__init__` method is
+ # called when an instance of the class is created as in the cell
+ # below. In the methods, `self` always refers to an instance of the
+ # class. In the `__init__` method, we have attached two objects to
+ # `self` as attributes: `flatten` and `sequential`. These are used in
+ # the `forward` method to describe the map that this module implements.
+ #
+ # There is one additional line in the `__init__` method, which
+ # is a call to
+ # `super()`. This function allows subclasses (i.e. `HittersModel`)
+ # to access methods of the class they inherit from. For example,
+ # the class `nn.Module` has its own `__init__` method, which is different from
+ # the `HittersModel.__init__()` method we’ve written above.
+ # Using `super()` allows us to call the method of the base class. For
+ # `torch` models, we will always be making this `super()` call as it is necessary
+ # for the model to be properly interpreted by `torch`.
+ #
+ # The object `nn.Module` has more methods than simply `__init__` and `forward`. These
+ # methods are directly accessible to `HittersModel` instances because of this inheritance.
+ # One such method we will see shortly is the `eval()` method, used
+ # to disable dropout for when we want to evaluate the model on test data.
+
+ # In[19]:
+
+
+ hit_model = HittersModel(X.shape[1])
+
+
+ # The object `self.sequential` is a composition of four maps. The
+ # first maps the 19 features of `Hitters` to 50 dimensions, introducing $50\times 19+50$ parameters
+ # for the weights and *intercept* of the map (often called the *bias*). This layer
+ # is then mapped to a ReLU layer followed by a 40% dropout layer, and finally a
+ # linear map down to 1 dimension, again with a bias. The total number of
+ # trainable parameters is therefore $50\times 19+50+50+1=1051$.
+
+ # The package `torchinfo` provides a `summary()` function that neatly summarizes
+ # this information. We specify the size of the input and see the size
+ # of each tensor as it passes through layers of the network.
+
+ # In[20]:
+
+
+ summary(hit_model,
+ input_size=X_train.shape,
+ col_names=['input_size',
+ 'output_size',
+ 'num_params'])
+
+
+ # We have truncated the end of the output slightly, here and in subsequent uses.
+ #
+ # We now need to transform our training data into a form accessible to `torch`.
+ # The basic
+ # datatype in `torch` is a `tensor`, which is very similar
+ # to an `ndarray` from early chapters.
+ # We also note here that `torch` typically
+ # works with 32-bit (*single precision*)
+ # rather than 64-bit (*double precision*) floating point numbers.
+ # We therefore convert our data to `np.float32` before
+ # forming the tensor.
+ # The $X$ and $Y$ tensors are then arranged into a `Dataset`
+ # recognized by `torch`
+ # using `TensorDataset()`.
+
+ # In[21]:
+
+
+ X_train_t = torch.tensor(X_train.astype(np.float32))
+ Y_train_t = torch.tensor(Y_train.astype(np.float32))
+ hit_train = TensorDataset(X_train_t, Y_train_t)
+
+
+ # We do the same for the test data.
+
+ # In[22]:
+
+
+ X_test_t = torch.tensor(X_test.astype(np.float32))
+ Y_test_t = torch.tensor(Y_test.astype(np.float32))
+ hit_test = TensorDataset(X_test_t, Y_test_t)
+
+
+ # Finally, this dataset is passed to a `DataLoader()` which ultimately
+ # passes data into our network. While this may seem
+ # like a lot of overhead, this structure is helpful for more
+ # complex tasks where data may live on different machines,
+ # or where data must be passed to a GPU.
+ # We provide a helper function `SimpleDataModule()` in `ISLP` to make this task easier for
+ # standard usage.
+ # One of its arguments is `num_workers`, which indicates
+ # how many processes we will use
+ # for loading the data. For small
+ # data like `Hitters` this will have little effect, but
+ # it does provide an advantage for the `MNIST` and `CIFAR100` examples below.
+ # The `torch` package will inspect the process running and determine a
+ # maximum number of workers. {This depends on the computing hardware and the number of cores available.} We’ve included a function
+ # `rec_num_workers()` to compute this so we know how many
+ # workers might be reasonable (here the max was 16).
+
+ # In[23]:
+
+
+ max_num_workers = rec_num_workers()
+
+
+ # The general training setup in `pytorch_lightning` involves
+ # training, validation and test data. These are each
+ # represented by different data loaders. During each epoch,
+ # we run a training step to learn the model and a validation
+ # step to track the error. The test data is typically
+ # used at the end of training to evaluate the model.
+ #
+ # In this case, as we had split only into test and training,
+ # we’ll use the test data as validation data with the
+ # argument `validation=hit_test`. The
+ # `validation` argument can be a float between 0 and 1, an
+ # integer, or a
+ # `Dataset`. If a float (respectively, integer), it is interpreted
+ # as a percentage (respectively number) of the *training* observations to be used for validation.
+ # If it is a `Dataset`, it is passed directly to a data loader.
+
+ # In[24]:
+
+
+ hit_dm = SimpleDataModule(hit_train,
+ hit_test,
+ batch_size=32,
+ num_workers=min(4, max_num_workers),
+ validation=hit_test)
+
+
+ # Next we must provide a `pytorch_lightning` module that controls
+ # the steps performed during the training process. We provide methods for our
+ # `SimpleModule()` that simply record the value
+ # of the loss function and any additional
+ # metrics at the end of each epoch. These operations
+ # are controlled by the methods `SimpleModule.[training/test/validation]_step()`, though
+ # we will not be modifying these in our examples.
+
+ # In[25]:
+
+
+ hit_module = SimpleModule.regression(hit_model,
+ metrics={'mae':MeanAbsoluteError()})
+
+
+ # By using the `SimpleModule.regression()` method, we indicate that we will use squared-error loss as in
+ # (10.23).
+ # We have also asked for mean absolute error to be tracked as well
+ # in the metrics that are logged.
+ #
+ # We log our results via `CSVLogger()`, which in this case stores the results in a CSV file within a directory `logs/hitters`. After the fitting is complete, this allows us to load the
+ # results as a `pd.DataFrame()` and visualize them below. There are
+ # several ways to log the results within `pytorch_lightning`, though
+ # we will not cover those here in detail.
+
+ # In[26]:
+
+
+ hit_logger = CSVLogger('logs', name='hitters')
+
+
+ # Finally we are ready to train our model and log the results. We
+ # use the `Trainer()` object from `pytorch_lightning`
+ # to do this work. The argument `datamodule=hit_dm` tells the trainer
+ # how training/validation/test logs are produced,
+ # while the first argument `hit_module`
+ # specifies the network architecture
+ # as well as the training/validation/test steps.
+ # The `callbacks` argument allows for
+ # several tasks to be carried out at various
+ # points while training a model. Here
+ # our `ErrorTracker()` callback will enable
+ # us to compute validation error while training
+ # and, finally, the test error.
+ # We now fit the model for 50 epochs.
+
+ # In[27]:
+
+
+ hit_trainer = Trainer(deterministic=True,
+ max_epochs=max_epochs,
+ log_every_n_steps=5,
+ logger=hit_logger,
+ callbacks=[ErrorTracker()])
+ hit_trainer.fit(hit_module, datamodule=hit_dm)
+
+
+ # At each step of SGD, the algorithm randomly selects 32 training observations for
+ # the computation of the gradient. Recall from Section 10.7
+ # that an epoch amounts to the number of SGD steps required to process $n$
+ # observations. Since the training set has
+ # $n=175$, and we specified a `batch_size` of 32 in the construction of `hit_dm`, an epoch is $175/32 \approx 5.5$ SGD steps.
+ #
+ # After having fit the model, we can evaluate performance on our test
+ # data using the `test()` method of our trainer.
+
+ # In[28]:
+
+
+ hit_trainer.test(hit_module, datamodule=hit_dm)
+
+
+ # The results of the fit have been logged into a CSV file. We can find the
+ # results specific to this run in the `experiment.metrics_file_path`
+ # attribute of our logger. Note that each time the model is fit, the logger will output
+ # results into a new subdirectory of our directory `logs/hitters`.
+ #
+ # We now create a plot of the MAE (mean absolute error) as a function of
+ # the number of epochs.
+ # First we retrieve the logged summaries.
+
+ # In[29]:
+
+
+ hit_results = pd.read_csv(hit_logger.experiment.metrics_file_path)
+
+
+ # Since we will produce similar plots in later examples, we write a
+ # simple generic function to produce this plot.
+
+ # In[30]:
+
+
+ def summary_plot(results,
+ ax,
+ col='loss',
+ valid_legend='Validation',
+ training_legend='Training',
+ ylabel='Loss',
+ fontsize=20):
+ for (column,
+ color,
+ label) in zip([f'train_{col}_epoch',
+ f'valid_{col}'],
+ ['black',
+ 'red'],
+ [training_legend,
+ valid_legend]):
+ results.plot(x='epoch',
+ y=column,
+ label=label,
+ marker='o',
+ color=color,
+ ax=ax)
+ ax.set_xlabel('Epoch')
+ ax.set_ylabel(ylabel)
+ return ax
+
+
+ # We now set up our axes, and use our function to produce the MAE plot.
+
+ # In[31]:
+
+
+ fig, ax = subplots(1, 1, figsize=(6, 6))
+ ax = summary_plot(hit_results,
+ ax,
+ col='mae',
+ ylabel='MAE',
+ valid_legend='Validation (=Test)')
+ ax.set_ylim([0, 400])
+ ax.set_xticks(np.linspace(0, 50, 11).astype(int));
+
+
+ # We can predict directly from the final model, and
+ # evaluate its performance on the test data.
+ # Before fitting, we call the `eval()` method
+ # of `hit_model`.
+ # This tells
+ # `torch` to effectively consider this model to be fitted, so that
+ # we can use it to predict on new data. For our model here,
+ # the biggest change is that the dropout layers will
+ # be turned off, i.e. no weights will be randomly
+ # dropped in predicting on new data.
+
+ # In[32]:
+
+
+ hit_model.eval()
+ preds = hit_module(X_test_t)
+ torch.abs(Y_test_t - preds).mean()
+
+
+
diff --git a/tests/deeplearning/test_mnist.py b/tests/deeplearning/test_mnist.py
new file mode 100644
index 0000000..c6d39d9
--- /dev/null
+++ b/tests/deeplearning/test_mnist.py
@@ -0,0 +1,258 @@
+
+# torch
+
+import torch
+from torch import nn
+
+# torch helpers
+
+from torchinfo import summary
+
+# pytorch lightning
+
+from pytorch_lightning import Trainer
+from pytorch_lightning.loggers import CSVLogger
+
+# setting seed
+
+from pytorch_lightning import seed_everything
+seed_everything(0, workers=True)
+torch.use_deterministic_algorithms(True, warn_only=True)
+
+# ISLP.torch
+
+from ISLP.torch import (SimpleDataModule,
+ SimpleModule,
+ ErrorTracker)
+
+from torchvision.datasets import MNIST
+from torchvision.transforms import ToTensor
+
+def test_mnist(max_epochs=2):
+
+
+ # ## Multilayer Network on the MNIST Digit Data
+ # The `torchvision` package comes with a number of example datasets,
+ # including the `MNIST` digit data. Our first step is to retrieve
+ # the training and test data sets; the `MNIST()` function within
+ # `torchvision.datasets` is provided for this purpose. The
+ # data will be downloaded the first time this function is executed, and stored in the directory `data/MNIST`.
+
+ # In[34]:
+
+
+ (mnist_train,
+ mnist_test) = [MNIST(root='data',
+ train=train,
+ download=True,
+ transform=ToTensor())
+ for train in [True, False]]
+ mnist_train
+
+
+ # There are 60,000 images in the training data and 10,000 in the test
+ # data. The images are $28\times 28$, and stored as a matrix of pixels. We
+ # need to transform each one into a vector.
+ #
+ # Neural networks are somewhat sensitive to the scale of the inputs, much as ridge and
+ # lasso regularization are affected by scaling. Here the inputs are eight-bit
+ # grayscale values between 0 and 255, so we rescale to the unit
+ # interval. {Note: eight bits means $2^8$, which equals 256. Since the convention
+ # is to start at $0$, the possible values range from $0$ to $255$.}
+ # This transformation, along with some reordering
+ # of the axes, is performed by the `ToTensor()` transform
+ # from the `torchvision.transforms` package.
+ #
+ # As in our `Hitters` example, we form a data module
+ # from the training and test datasets, setting aside 20%
+ # of the training images for validation.
+
+ # In[35]:
+
+
+ mnist_dm = SimpleDataModule(mnist_train,
+ mnist_test,
+ validation=0.2,
+ num_workers=2,
+ batch_size=256)
+
+
+ # Let’s take a look at the data that will get fed into our network. We loop through the first few
+ # chunks of the training dataset, breaking after 2 batches:
+
+ # In[36]:
+
+
+ for idx, (X_ ,Y_) in enumerate(mnist_dm.train_dataloader()):
+ print('X: ', X_.shape)
+ print('Y: ', Y_.shape)
+ if idx >= 1:
+ break
+
+
+ # We see that the $X$ for each batch consists of 256 images of size `1x28x28`.
+ # Here the `1` indicates a single channel (grayscale). For RGB images such as `CIFAR100` below,
+ # we will see that the `1` in the size will be replaced by `3` for the three RGB channels.
+ #
+ # Now we are ready to specify our neural network.
+
+ # In[37]:
+
+
+ class MNISTModel(nn.Module):
+ def __init__(self):
+ super(MNISTModel, self).__init__()
+ self.layer1 = nn.Sequential(
+ nn.Flatten(),
+ nn.Linear(28*28, 256),
+ nn.ReLU(),
+ nn.Dropout(0.4))
+ self.layer2 = nn.Sequential(
+ nn.Linear(256, 128),
+ nn.ReLU(),
+ nn.Dropout(0.3))
+ self._forward = nn.Sequential(
+ self.layer1,
+ self.layer2,
+ nn.Linear(128, 10))
+ def forward(self, x):
+ return self._forward(x)
+
+
+ # We see that in the first layer, each `1x28x28` image is flattened, then mapped to
+ # 256 dimensions where we apply a ReLU activation with 40% dropout.
+ # A second layer maps the first layer’s output down to
+ # 128 dimensions, applying a ReLU activation with 30% dropout. Finally,
+ # the 128 dimensions are mapped down to 10, the number of classes in the
+ # `MNIST` data.
+
+ # In[38]:
+
+
+ mnist_model = MNISTModel()
+
+
+ # We can check that the model produces output of expected size based
+ # on our existing batch `X_` above.
+
+ # In[39]:
+
+
+ mnist_model(X_).size()
+
+
+ # Let’s take a look at the summary of the model. Instead of an `input_size` we can pass
+ # a tensor of correct shape. In this case, we pass through the final
+ # batched `X_` from above.
+
+ # In[40]:
+
+
+ summary(mnist_model,
+ input_data=X_,
+ col_names=['input_size',
+ 'output_size',
+ 'num_params'])
+
+
+ # Having set up both the model and the data module, fitting this model is
+ # now almost identical to the `Hitters` example. In contrast to our regression model, here we will use the
+ # `SimpleModule.classification()` method which
+ # uses the cross-entropy loss function instead of mean squared error. It must be supplied with the number of classes in the problem.
+
+ # In[41]:
+
+
+ mnist_module = SimpleModule.classification(mnist_model,
+ num_classes=10)
+ mnist_logger = CSVLogger('logs', name='MNIST')
+
+
+ # Now we are ready to go. The final step is to supply training data, and fit the model.
+
+ # In[42]:
+
+
+ mnist_trainer = Trainer(deterministic=True,
+ max_epochs=max_epochs,
+ logger=mnist_logger,
+ callbacks=[ErrorTracker()])
+ mnist_trainer.fit(mnist_module,
+ datamodule=mnist_dm)
+
+
+ # We have suppressed the output here, which is a progress report on the
+ # fitting of the model, grouped by epoch. This is very useful, since on
+ # large datasets fitting can take time. Fitting this model took 245
+ # seconds on a MacBook Pro with an Apple M1 Pro chip with 10 cores and 16 GB of RAM.
+ # Here we specified a
+ # validation split of 20%, so training is actually performed on
+ # 80% of the 60,000 observations in the training set. This is an
+ # alternative to actually supplying validation data, like we did for the `Hitters` data.
+ # SGD uses batches
+ # of 256 observations in computing the gradient, and doing the
+ # arithmetic, we see that an epoch corresponds to 188 gradient steps.
+
+ # `SimpleModule.classification()` includes
+ # an accuracy metric by default. Other
+ # classification metrics can be added from `torchmetrics`.
+ # We will use our `summary_plot()` function to display
+ # accuracy across epochs.
+
+
+ mnist_trainer.test(mnist_module,
+ datamodule=mnist_dm)
+
+
+ # Table 10.1 also reports the error rates resulting from LDA (Chapter 4) and multiclass logistic
+ # regression. For LDA we refer the reader to Section 4.7.3.
+ # Although we could use the `sklearn` function `LogisticRegression()` to fit
+ # multiclass logistic regression, we are set up here to fit such a model
+ # with `torch`.
+ # We just have an input layer and an output layer, and omit the hidden layers!
+
+ # In[45]:
+
+
+ class MNIST_MLR(nn.Module):
+ def __init__(self):
+ super(MNIST_MLR, self).__init__()
+ self.linear = nn.Sequential(nn.Flatten(),
+ nn.Linear(784, 10))
+ def forward(self, x):
+ return self.linear(x)
+
+ mlr_model = MNIST_MLR()
+ mlr_module = SimpleModule.classification(mlr_model,
+ num_classes=10)
+ mlr_logger = CSVLogger('logs', name='MNIST_MLR')
+
+
+ # In[46]:
+
+
+ mlr_trainer = Trainer(deterministic=True,
+ max_epochs=30,
+ callbacks=[ErrorTracker()])
+ mlr_trainer.fit(mlr_module, datamodule=mnist_dm)
+
+
+ # We fit the model just as before and compute the test results.
+
+ # In[47]:
+
+
+ mlr_trainer.test(mlr_module,
+ datamodule=mnist_dm)
+
+
+ # The accuracy is above 90% even for this pretty simple model.
+ #
+ # As in the `Hitters` example, we delete some of
+ # the objects we created above.
+
+ # In[48]:
+
+
+
+
diff --git a/tests/models/test_boolean_columns.py b/tests/models/test_boolean_columns.py
new file mode 100644
index 0000000..7b5a429
--- /dev/null
+++ b/tests/models/test_boolean_columns.py
@@ -0,0 +1,23 @@
+import pandas as pd
+import statsmodels.api as sm
+import numpy as np
+from itertools import combinations
+
+from ISLP.models import ModelSpec as MS
+
+rng = np.random.default_rng(0)
+
+df = pd.DataFrame({'A':rng.standard_normal(10),
+ 'B':np.array([1,2,3,2,1,1,1,3,2,1], int),
+ 'C':np.array([True,False,False,True,True]*2, bool),
+ 'D':rng.standard_normal(10)})
+Y = rng.standard_normal(10)
+
+def test_all():
+
+ for i in range(1, 5):
+ for comb in combinations(['A','B','C','D'], i):
+
+ X = MS(comb).fit_transform(df)
+ sm.OLS(Y, X).fit()
+
diff --git a/ISLP/models/tests/test_columns.py b/tests/models/test_columns.py
similarity index 79%
rename from ISLP/models/tests/test_columns.py
rename to tests/models/test_columns.py
index a86941b..77ba784 100644
--- a/ISLP/models/tests/test_columns.py
+++ b/tests/models/test_columns.py
@@ -3,6 +3,7 @@
from pandas.api.types import CategoricalDtype
from ISLP.models.columns import _get_column_info
+from ISLP.models.model_spec import Contrast
def test_column_info():
@@ -15,5 +16,7 @@ def test_column_info():
print(_get_column_info(df,
df.columns,
[False]*4+[True],
- [False]*5))
+ [False]*5,
+ categorical_encoders={'categorical':Contrast(method='drop')}))
+
diff --git a/ISLP/models/tests/test_model_matrix.py b/tests/models/test_model_matrix.py
similarity index 86%
rename from ISLP/models/tests/test_model_matrix.py
rename to tests/models/test_model_matrix.py
index 51e079c..70b9cab 100644
--- a/ISLP/models/tests/test_model_matrix.py
+++ b/tests/models/test_model_matrix.py
@@ -2,7 +2,7 @@
from sklearn.base import clone
from ISLP.transforms import Poly, NaturalSpline, BSpline, Interaction
-from ISLP.models.model_spec import ModelSpec, Variable, ns, bs, poly, pca, contrast, Contrast
+from ISLP.models.model_spec import ModelSpec, Feature, ns, bs, poly, pca, contrast, Contrast, build_model
from sklearn.preprocessing import (OneHotEncoder,
OrdinalEncoder)
@@ -37,7 +37,7 @@ def test_ndarray():
X = rng.standard_normal((50,5))
M = ModelSpec(terms=[1, (3,2)],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
M.fit(X)
MX = M.transform(X)
@@ -51,7 +51,7 @@ def test_dataframe1():
D = pd.DataFrame(X, columns=['A','B','C','D','E'])
M = ModelSpec(terms=['A','D',('D','E')],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
clone(M)
MX = np.asarray(M.fit_transform(D))
@@ -66,7 +66,7 @@ def test_dataframe2():
D = pd.DataFrame(X, columns=['V','B','A','D','E'])
M = ModelSpec(terms=['A', 'D', 'B', ('D','E'), 'V'],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
clone(M)
MX = M.fit_transform(D)
@@ -83,7 +83,7 @@ def test_dataframe3():
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))
M = ModelSpec(terms=['A', 'E', ('D','E')],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
MX = np.asarray(M.fit_transform(D))
M2 = clone(M)
@@ -105,7 +105,7 @@ def test_dataframe4():
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))
M = ModelSpec(terms=['A', 'E', ('D','E'), 'D'],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
MX = np.asarray(M.fit_transform(D))
DE = pd.get_dummies(D['E'])
@@ -119,7 +119,6 @@ def test_dataframe4():
np.testing.assert_allclose(MX, MX2)
print(MX2.columns)
- return M, D
def test_dataframe5():
@@ -130,7 +129,7 @@ def test_dataframe5():
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))
M = ModelSpec(terms=['A', 'E', ('D','E')],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
MX = np.asarray(M.fit_transform(D))
# check they agree on copy of dataframe
@@ -144,12 +143,12 @@ def test_dataframe6():
rng = np.random.default_rng(11)
X = rng.standard_normal((50,5))
D = pd.DataFrame(X, columns=['A','B','C','D','E'])
- W = Variable(('A','E'), 'AE', None)
+ W = Feature(('A','E'), 'AE', None)
D['D'] = pd.Categorical(rng.choice(['a','b','c'], 50, replace=True))
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))
M = ModelSpec(terms=['A',W,(W,'D',)],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
MX = M.fit_transform(D)
MX = np.asarray(MX)
@@ -163,7 +162,7 @@ def test_dataframe7():
D['Eee'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))
M = ModelSpec(terms=D.columns.drop(['Y','C']),
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
MX = M.fit_transform(D)
print(MX.columns)
MX = np.asarray(MX)
@@ -178,9 +177,9 @@ def test_dataframe8():
poly = Poly(degree=3)
# raises a ValueError because poly will have been already fit -- need new instance of Poly
- W = Variable(('A',), 'poly(A)', poly)
+ W = Feature(('A',), 'poly(A)', poly)
M = ModelSpec(terms=list(D.columns.drop(['Y','C'])) + [(W,'E')],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
MX = M.fit_transform(D)
print(MX.columns)
@@ -196,10 +195,10 @@ def test_dataframe9():
poly = Poly(degree=3)
# raises a ValueError because poly will have been already fit -- need new instance of Poly
- W = Variable(('A',), 'poly(A)', poly)
- U = Variable(('B',), 'poly(B)', clone(poly))
+ W = Feature(('A',), 'poly(A)', poly)
+ U = Feature(('B',), 'poly(B)', clone(poly))
M = ModelSpec(terms=list(D.columns.drop(['Y','C'])) + [W,U],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
MX = M.fit_transform(D)
print(MX.columns)
@@ -210,13 +209,13 @@ def test_dataframe10():
rng = np.random.default_rng(15)
X = rng.standard_normal((50,5))
D = pd.DataFrame(X, columns=['A','B','C','D','E'])
- W = Variable(('A','E'), 'AE', None)
- U = Variable((W, 'C'), 'WC', None)
+ W = Feature(('A','E'), 'AE', None)
+ U = Feature((W, 'C'), 'WC', None)
D['D'] = pd.Categorical(rng.choice(['a','b','c'], 50, replace=True))
D['E'] = pd.Categorical(rng.choice(range(4,8), 50, replace=True))
M = ModelSpec(terms=['A', 'E', 'C', W, (W, 'D',), U],
- default_encoders=default_encoders)
+ categorical_encoders=default_encoders)
MX = M.fit_transform(D)
print(MX.columns)
MX = np.asarray(MX)
@@ -258,7 +257,11 @@ def test_submodel():
M.fit(D)
MX = M.transform(D)
- MXsub = M.build_submodel(D, M.terms[:2])
+ MXsub = build_model(M.column_info_,
+ D,
+ M.terms[:2],
+ intercept=M.intercept,
+ encoders=M.encoders_)
print(MX.columns)
print(MXsub.columns)
@@ -275,7 +278,11 @@ def test_contrast():
M.fit(D)
MX = M.transform(D)
- MXsub = M.build_submodel(D, M.terms[:2])
+ MXsub = build_model(M.column_info_,
+ D,
+ M.terms[:2],
+ intercept=M.intercept,
+ encoders=M.encoders_)
print(method, MX.columns)
print(MXsub.columns)
@@ -309,7 +316,7 @@ def test_pca():
X = rng.standard_normal((50,8))
D = pd.DataFrame(X, columns=['A','B','C','D','E', 'F', 'G', 'H'])
- pca_ = Variable(('A','B','C','D'), 'pca(ABCD)', PCA(n_components=2))
+ pca_ = Feature(('A','B','C','D'), 'pca(ABCD)', PCA(n_components=2))
M = ModelSpec(terms=[poly('F', intercept=True, degree=3),
pca_])
diff --git a/ISLP/models/tests/test_selection.py b/tests/models/test_selection.py
similarity index 100%
rename from ISLP/models/tests/test_selection.py
rename to tests/models/test_selection.py
diff --git a/tests/models/test_sklearn_wrap.py b/tests/models/test_sklearn_wrap.py
new file mode 100644
index 0000000..c3616bd
--- /dev/null
+++ b/tests/models/test_sklearn_wrap.py
@@ -0,0 +1,46 @@
+
+import numpy as np
+import pandas as pd
+import statsmodels.api as sm
+from sklearn.base import is_classifier, is_regressor
+import pytest
+
+from ISLP.models.sklearn_wrap import sklearn_sm, sklearn_selected
+from ISLP.models.model_spec import ModelSpec
+from ISLP.models.strategy import min_max
+
+@pytest.fixture
+def model_setup():
+ X = pd.DataFrame({'X1': np.random.rand(10), 'X2': np.random.rand(10), 'X3': np.random.rand(10)})
+ y = pd.Series(np.random.randint(0, 2, 10)) # For classifier
+ model_spec_dummy = ModelSpec(['X1', 'X2', 'X3']).fit(X)
+ min_max_strategy_dummy = min_max(model_spec_dummy, min_terms=1, max_terms=2)
+ return X, y, model_spec_dummy, min_max_strategy_dummy
+
+def test_OLS_is_regressor():
+ model = sklearn_sm(sm.OLS)
+ assert model.__sklearn_tags__().estimator_type == 'regressor'
+ assert is_regressor(model)
+
+def test_GLM_binomial_is_classifier():
+ model = sklearn_sm(sm.GLM, model_args={'family': sm.families.Binomial()})
+ assert model.__sklearn_tags__().estimator_type == 'classifier'
+ assert is_classifier(model)
+
+def test_GLM_binomial_probit_is_classifier():
+ model = sklearn_sm(sm.GLM, model_args={'family': sm.families.Binomial(link=sm.families.links.Probit())})
+ assert model.__sklearn_tags__().estimator_type == 'classifier'
+ assert is_classifier(model)
+
+
+def test_selected_OLS_is_regressor(model_setup):
+ X, y, model_spec_dummy, min_max_strategy_dummy = model_setup
+ model = sklearn_selected(sm.OLS, strategy=min_max_strategy_dummy)
+ assert model.__sklearn_tags__().estimator_type == 'regressor'
+ assert is_regressor(model)
+
+def test_selected_GLM_binomial_is_classifier(model_setup):
+ X, y, model_spec_dummy, min_max_strategy_dummy = model_setup
+ model = sklearn_selected(sm.GLM, strategy=min_max_strategy_dummy, model_args={'family': sm.families.Binomial()})
+ assert model.__sklearn_tags__().estimator_type == 'classifier'
+ assert is_classifier(model)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
new file mode 100644
index 0000000..39f1447
--- /dev/null
+++ b/tests/test_datasets.py
@@ -0,0 +1,30 @@
+# test that all datasets import
+
+from ISLP import load_data
+import numpy as np
+import pytest
+
+datasets = ['Auto',
+ 'Bikeshare',
+ 'Boston',
+ 'BrainCancer',
+ 'Caravan',
+ 'Carseats',
+ 'College',
+ 'Credit',
+ 'Default',
+ 'Fund',
+ 'Hitters',
+ 'NYSE',
+ 'OJ',
+ 'Portfolio',
+ 'Publication',
+ 'Smarket',
+ 'Wage',
+ 'Weekly']
+
+@pytest.mark.parametrize('dataset', datasets)
+def test_load(dataset):
+ df = load_data(dataset)
+ for col in df.columns:
+ assert df[col].dtype != np.dtype(object)
diff --git a/torch_requirements.txt b/torch_requirements.txt
deleted file mode 100644
index f3b355a..0000000
--- a/torch_requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-torch
-torchvision
-torchmetrics
-torchdata
-pytorch_lightning
-torchinfo
diff --git a/versioneer.py b/versioneer.py
deleted file mode 100644
index b4cd1d6..0000000
--- a/versioneer.py
+++ /dev/null
@@ -1,2109 +0,0 @@
-
-# Version: 0.21
-
-"""The Versioneer - like a rocketeer, but for versions.
-
-The Versioneer
-==============
-
-* like a rocketeer, but for versions!
-* https://github.com/python-versioneer/python-versioneer
-* Brian Warner
-* License: Public Domain
-* Compatible with: Python 3.6, 3.7, 3.8, 3.9 and pypy3
-* [![Latest Version][pypi-image]][pypi-url]
-* [![Build Status][travis-image]][travis-url]
-
-This is a tool for managing a recorded version number in distutils-based
-python projects. The goal is to remove the tedious and error-prone "update
-the embedded version string" step from your release process. Making a new
-release should be as easy as recording a new tag in your version-control
-system, and maybe making new tarballs.
-
-
-## Quick Install
-
-* `pip install versioneer` to somewhere in your $PATH
-* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md))
-* run `versioneer install` in your source tree, commit the results
-* Verify version information with `python setup.py version`
-
-## Version Identifiers
-
-Source trees come from a variety of places:
-
-* a version-control system checkout (mostly used by developers)
-* a nightly tarball, produced by build automation
-* a snapshot tarball, produced by a web-based VCS browser, like github's
- "tarball from tag" feature
-* a release tarball, produced by "setup.py sdist", distributed through PyPI
-
-Within each source tree, the version identifier (either a string or a number,
-this tool is format-agnostic) can come from a variety of places:
-
-* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows
- about recent "tags" and an absolute revision-id
-* the name of the directory into which the tarball was unpacked
-* an expanded VCS keyword ($Id$, etc)
-* a `_version.py` created by some earlier build step
-
-For released software, the version identifier is closely related to a VCS
-tag. Some projects use tag names that include more than just the version
-string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool
-needs to strip the tag prefix to extract the version identifier. For
-unreleased software (between tags), the version identifier should provide
-enough information to help developers recreate the same tree, while also
-giving them an idea of roughly how old the tree is (after version 1.2, before
-version 1.3). Many VCS systems can report a description that captures this,
-for example `git describe --tags --dirty --always` reports things like
-"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the
-0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has
-uncommitted changes).
-
-The version identifier is used for multiple purposes:
-
-* to allow the module to self-identify its version: `myproject.__version__`
-* to choose a name and prefix for a 'setup.py sdist' tarball
-
-## Theory of Operation
-
-Versioneer works by adding a special `_version.py` file into your source
-tree, where your `__init__.py` can import it. This `_version.py` knows how to
-dynamically ask the VCS tool for version information at import time.
-
-`_version.py` also contains `$Revision$` markers, and the installation
-process marks `_version.py` to have this marker rewritten with a tag name
-during the `git archive` command. As a result, generated tarballs will
-contain enough information to get the proper version.
-
-To allow `setup.py` to compute a version too, a `versioneer.py` is added to
-the top level of your source tree, next to `setup.py` and the `setup.cfg`
-that configures it. This overrides several distutils/setuptools commands to
-compute the version when invoked, and changes `setup.py build` and `setup.py
-sdist` to replace `_version.py` with a small static file that contains just
-the generated version data.
-
-## Installation
-
-See [INSTALL.md](./INSTALL.md) for detailed installation instructions.
-
-## Version-String Flavors
-
-Code which uses Versioneer can learn about its version string at runtime by
-importing `_version` from your main `__init__.py` file and running the
-`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can
-import the top-level `versioneer.py` and run `get_versions()`.
-
-Both functions return a dictionary with different flavors of version
-information:
-
-* `['version']`: A condensed version string, rendered using the selected
- style. This is the most commonly used value for the project's version
- string. The default "pep440" style yields strings like `0.11`,
- `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section
- below for alternative styles.
-
-* `['full-revisionid']`: detailed revision identifier. For Git, this is the
- full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac".
-
-* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the
- commit date in ISO 8601 format. This will be None if the date is not
- available.
-
-* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that
- this is only accurate if run in a VCS checkout, otherwise it is likely to
- be False or None
-
-* `['error']`: if the version string could not be computed, this will be set
- to a string describing the problem, otherwise it will be None. It may be
- useful to throw an exception in setup.py if this is set, to avoid e.g.
- creating tarballs with a version string of "unknown".
-
-Some variants are more useful than others. Including `full-revisionid` in a
-bug report should allow developers to reconstruct the exact code being tested
-(or indicate the presence of local changes that should be shared with the
-developers). `version` is suitable for display in an "about" box or a CLI
-`--version` output: it can be easily compared against release notes and lists
-of bugs fixed in various releases.
-
-The installer adds the following text to your `__init__.py` to place a basic
-version in `YOURPROJECT.__version__`:
-
- from ._version import get_versions
- __version__ = get_versions()['version']
- del get_versions
-
-## Styles
-
-The setup.cfg `style=` configuration controls how the VCS information is
-rendered into a version string.
-
-The default style, "pep440", produces a PEP440-compliant string, equal to the
-un-prefixed tag name for actual releases, and containing an additional "local
-version" section with more detail for in-between builds. For Git, this is
-TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags
---dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the
-tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and
-that this commit is two revisions ("+2") beyond the "0.11" tag. For released
-software (exactly equal to a known tag), the identifier will only contain the
-stripped tag, e.g. "0.11".
-
-Other styles are available. See [details.md](details.md) in the Versioneer
-source tree for descriptions.
-
-## Debugging
-
-Versioneer tries to avoid fatal errors: if something goes wrong, it will tend
-to return a version of "0+unknown". To investigate the problem, run `setup.py
-version`, which will run the version-lookup code in a verbose mode, and will
-display the full contents of `get_versions()` (including the `error` string,
-which may help identify what went wrong).
-
-## Known Limitations
-
-Some situations are known to cause problems for Versioneer. This details the
-most significant ones. More can be found on Github
-[issues page](https://github.com/python-versioneer/python-versioneer/issues).
-
-### Subprojects
-
-Versioneer has limited support for source trees in which `setup.py` is not in
-the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are
-two common reasons why `setup.py` might not be in the root:
-
-* Source trees which contain multiple subprojects, such as
- [Buildbot](https://github.com/buildbot/buildbot), which contains both
- "master" and "slave" subprojects, each with their own `setup.py`,
- `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI
- distributions (and upload multiple independently-installable tarballs).
-* Source trees whose main purpose is to contain a C library, but which also
- provide bindings to Python (and perhaps other languages) in subdirectories.
-
-Versioneer will look for `.git` in parent directories, and most operations
-should get the right version string. However `pip` and `setuptools` have bugs
-and implementation details which frequently cause `pip install .` from a
-subproject directory to fail to find a correct version string (so it usually
-defaults to `0+unknown`).
-
-`pip install --editable .` should work correctly. `setup.py install` might
-work too.
-
-Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in
-some later version.
-
-[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking
-this issue. The discussion in
-[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the
-issue from the Versioneer side in more detail.
-[pip PR#3176](https://github.com/pypa/pip/pull/3176) and
-[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve
-pip to let Versioneer work correctly.
-
-Versioneer-0.16 and earlier only looked for a `.git` directory next to the
-`setup.cfg`, so subprojects were completely unsupported with those releases.
-
-### Editable installs with setuptools <= 18.5
-
-`setup.py develop` and `pip install --editable .` allow you to install a
-project into a virtualenv once, then continue editing the source code (and
-test) without re-installing after every change.
-
-"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a
-convenient way to specify executable scripts that should be installed along
-with the python package.
-
-These both work as expected when using modern setuptools. When using
-setuptools-18.5 or earlier, however, certain operations will cause
-`pkg_resources.DistributionNotFound` errors when running the entrypoint
-script, which must be resolved by re-installing the package. This happens
-when the install happens with one version, then the egg_info data is
-regenerated while a different version is checked out. Many setup.py commands
-cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into
-a different virtualenv), so this can be surprising.
-
-[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes
-this one, but upgrading to a newer version of setuptools should probably
-resolve it.
-
-
-## Updating Versioneer
-
-To upgrade your project to a new release of Versioneer, do the following:
-
-* install the new Versioneer (`pip install -U versioneer` or equivalent)
-* edit `setup.cfg`, if necessary, to include any new configuration settings
- indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details.
-* re-run `versioneer install` in your source tree, to replace
- `SRC/_version.py`
-* commit any changed files
-
-## Future Directions
-
-This tool is designed to make it easily extended to other version-control
-systems: all VCS-specific components are in separate directories like
-src/git/ . The top-level `versioneer.py` script is assembled from these
-components by running make-versioneer.py . In the future, make-versioneer.py
-will take a VCS name as an argument, and will construct a version of
-`versioneer.py` that is specific to the given VCS. It might also take the
-configuration arguments that are currently provided manually during
-installation by editing setup.py . Alternatively, it might go the other
-direction and include code from all supported VCS systems, reducing the
-number of intermediate scripts.
-
-## Similar projects
-
-* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time
- dependency
-* [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of
- versioneer
-* [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools
- plugin
-
-## License
-
-To make Versioneer easier to embed, all its code is dedicated to the public
-domain. The `_version.py` that it creates is also in the public domain.
-Specifically, both are released under the Creative Commons "Public Domain
-Dedication" license (CC0-1.0), as described in
-https://creativecommons.org/publicdomain/zero/1.0/ .
-
-[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg
-[pypi-url]: https://pypi.python.org/pypi/versioneer/
-[travis-image]:
-https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg
-[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer
-
-"""
-# pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring
-# pylint:disable=missing-class-docstring,too-many-branches,too-many-statements
-# pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error
-# pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with
-# pylint:disable=attribute-defined-outside-init,too-many-arguments
-
-import configparser
-import errno
-import json
-import os
-import re
-import subprocess
-import sys
-from typing import Callable, Dict
-
-
-class VersioneerConfig:
- """Container for Versioneer configuration parameters."""
-
-
-def get_root():
- """Get the project root directory.
-
- We require that all commands are run from the project root, i.e. the
- directory that contains setup.py, setup.cfg, and versioneer.py .
- """
- root = os.path.realpath(os.path.abspath(os.getcwd()))
- setup_py = os.path.join(root, "setup.py")
- versioneer_py = os.path.join(root, "versioneer.py")
- if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
- # allow 'python path/to/setup.py COMMAND'
- root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
- setup_py = os.path.join(root, "setup.py")
- versioneer_py = os.path.join(root, "versioneer.py")
- if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
- err = ("Versioneer was unable to run the project root directory. "
- "Versioneer requires setup.py to be executed from "
- "its immediate directory (like 'python setup.py COMMAND'), "
- "or in a way that lets it use sys.argv[0] to find the root "
- "(like 'python path/to/setup.py COMMAND').")
- raise VersioneerBadRootError(err)
- try:
- # Certain runtime workflows (setup.py install/develop in a setuptools
- # tree) execute all dependencies in a single python process, so
- # "versioneer" may be imported multiple times, and python's shared
- # module-import table will cache the first one. So we can't use
- # os.path.dirname(__file__), as that will find whichever
- # versioneer.py was first imported, even in later projects.
- my_path = os.path.realpath(os.path.abspath(__file__))
- me_dir = os.path.normcase(os.path.splitext(my_path)[0])
- vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
- if me_dir != vsr_dir:
- print("Warning: build in %s is using versioneer.py from %s"
- % (os.path.dirname(my_path), versioneer_py))
- except NameError:
- pass
- return root
-
-
-def get_config_from_root(root):
- """Read the project setup.cfg file to determine Versioneer config."""
- # This might raise OSError (if setup.cfg is missing), or
- # configparser.NoSectionError (if it lacks a [versioneer] section), or
- # configparser.NoOptionError (if it lacks "VCS="). See the docstring at
- # the top of versioneer.py for instructions on writing your setup.cfg .
- setup_cfg = os.path.join(root, "setup.cfg")
- parser = configparser.ConfigParser()
- with open(setup_cfg, "r") as cfg_file:
- parser.read_file(cfg_file)
- VCS = parser.get("versioneer", "VCS") # mandatory
-
- # Dict-like interface for non-mandatory entries
- section = parser["versioneer"]
-
- cfg = VersioneerConfig()
- cfg.VCS = VCS
- cfg.style = section.get("style", "")
- cfg.versionfile_source = section.get("versionfile_source")
- cfg.versionfile_build = section.get("versionfile_build")
- cfg.tag_prefix = section.get("tag_prefix")
- if cfg.tag_prefix in ("''", '""'):
- cfg.tag_prefix = ""
- cfg.parentdir_prefix = section.get("parentdir_prefix")
- cfg.verbose = section.get("verbose")
- return cfg
-
-
-class NotThisMethod(Exception):
- """Exception raised if a method is not valid for the current scenario."""
-
-
-# these dictionaries contain VCS-specific tools
-LONG_VERSION_PY: Dict[str, str] = {}
-HANDLERS: Dict[str, Dict[str, Callable]] = {}
-
-
-def register_vcs_handler(vcs, method): # decorator
- """Create decorator to mark a method as the handler of a VCS."""
- def decorate(f):
- """Store f in HANDLERS[vcs][method]."""
- HANDLERS.setdefault(vcs, {})[method] = f
- return f
- return decorate
-
-
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
- env=None):
- """Call the given command(s)."""
- assert isinstance(commands, list)
- process = None
- for command in commands:
- try:
- dispcmd = str([command] + args)
- # remember shell=False, so use git.cmd on windows, not just git
- process = subprocess.Popen([command] + args, cwd=cwd, env=env,
- stdout=subprocess.PIPE,
- stderr=(subprocess.PIPE if hide_stderr
- else None))
- break
- except OSError:
- e = sys.exc_info()[1]
- if e.errno == errno.ENOENT:
- continue
- if verbose:
- print("unable to run %s" % dispcmd)
- print(e)
- return None, None
- else:
- if verbose:
- print("unable to find command, tried %s" % (commands,))
- return None, None
- stdout = process.communicate()[0].strip().decode()
- if process.returncode != 0:
- if verbose:
- print("unable to run %s (error)" % dispcmd)
- print("stdout was %s" % stdout)
- return None, process.returncode
- return stdout, process.returncode
-
-
-LONG_VERSION_PY['git'] = r'''
-# This file helps to compute a version number in source trees obtained from
-# git-archive tarball (such as those provided by githubs download-from-tag
-# feature). Distribution tarballs (built by setup.py sdist) and build
-# directories (produced by setup.py build) will contain a much shorter file
-# that just contains the computed version number.
-
-# This file is released into the public domain. Generated by
-# versioneer-0.21 (https://github.com/python-versioneer/python-versioneer)
-
-"""Git implementation of _version.py."""
-
-import errno
-import os
-import re
-import subprocess
-import sys
-from typing import Callable, Dict
-
-
-def get_keywords():
- """Get the keywords needed to look up the version information."""
- # these strings will be replaced by git during git-archive.
- # setup.py/versioneer.py will grep for the variable names, so they must
- # each be defined on a line of their own. _version.py will just call
- # get_keywords().
- git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s"
- git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s"
- git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s"
- keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
- return keywords
-
-
-class VersioneerConfig:
- """Container for Versioneer configuration parameters."""
-
-
-def get_config():
- """Create, populate and return the VersioneerConfig() object."""
- # these strings are filled in when 'setup.py versioneer' creates
- # _version.py
- cfg = VersioneerConfig()
- cfg.VCS = "git"
- cfg.style = "%(STYLE)s"
- cfg.tag_prefix = "%(TAG_PREFIX)s"
- cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s"
- cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s"
- cfg.verbose = False
- return cfg
-
-
-class NotThisMethod(Exception):
- """Exception raised if a method is not valid for the current scenario."""
-
-
-LONG_VERSION_PY: Dict[str, str] = {}
-HANDLERS: Dict[str, Dict[str, Callable]] = {}
-
-
-def register_vcs_handler(vcs, method): # decorator
- """Create decorator to mark a method as the handler of a VCS."""
- def decorate(f):
- """Store f in HANDLERS[vcs][method]."""
- if vcs not in HANDLERS:
- HANDLERS[vcs] = {}
- HANDLERS[vcs][method] = f
- return f
- return decorate
-
-
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
- env=None):
- """Call the given command(s)."""
- assert isinstance(commands, list)
- process = None
- for command in commands:
- try:
- dispcmd = str([command] + args)
- # remember shell=False, so use git.cmd on windows, not just git
- process = subprocess.Popen([command] + args, cwd=cwd, env=env,
- stdout=subprocess.PIPE,
- stderr=(subprocess.PIPE if hide_stderr
- else None))
- break
- except OSError:
- e = sys.exc_info()[1]
- if e.errno == errno.ENOENT:
- continue
- if verbose:
- print("unable to run %%s" %% dispcmd)
- print(e)
- return None, None
- else:
- if verbose:
- print("unable to find command, tried %%s" %% (commands,))
- return None, None
- stdout = process.communicate()[0].strip().decode()
- if process.returncode != 0:
- if verbose:
- print("unable to run %%s (error)" %% dispcmd)
- print("stdout was %%s" %% stdout)
- return None, process.returncode
- return stdout, process.returncode
-
-
-def versions_from_parentdir(parentdir_prefix, root, verbose):
- """Try to determine the version from the parent directory name.
-
- Source tarballs conventionally unpack into a directory that includes both
- the project name and a version string. We will also support searching up
- two directory levels for an appropriately named parent directory
- """
- rootdirs = []
-
- for _ in range(3):
- dirname = os.path.basename(root)
- if dirname.startswith(parentdir_prefix):
- return {"version": dirname[len(parentdir_prefix):],
- "full-revisionid": None,
- "dirty": False, "error": None, "date": None}
- rootdirs.append(root)
- root = os.path.dirname(root) # up a level
-
- if verbose:
- print("Tried directories %%s but none started with prefix %%s" %%
- (str(rootdirs), parentdir_prefix))
- raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
-
-
-@register_vcs_handler("git", "get_keywords")
-def git_get_keywords(versionfile_abs):
- """Extract version information from the given file."""
- # the code embedded in _version.py can just fetch the value of these
- # keywords. When used from setup.py, we don't want to import _version.py,
- # so we do it with a regexp instead. This function is not used from
- # _version.py.
- keywords = {}
- try:
- with open(versionfile_abs, "r") as fobj:
- for line in fobj:
- if line.strip().startswith("git_refnames ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["refnames"] = mo.group(1)
- if line.strip().startswith("git_full ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["full"] = mo.group(1)
- if line.strip().startswith("git_date ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["date"] = mo.group(1)
- except OSError:
- pass
- return keywords
-
-
-@register_vcs_handler("git", "keywords")
-def git_versions_from_keywords(keywords, tag_prefix, verbose):
- """Get version information from git keywords."""
- if "refnames" not in keywords:
- raise NotThisMethod("Short version file found")
- date = keywords.get("date")
- if date is not None:
- # Use only the last line. Previous lines may contain GPG signature
- # information.
- date = date.splitlines()[-1]
-
- # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant
- # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601
- # -like" string, which we must then edit to make compliant), because
- # it's been around since git-1.5.3, and it's too difficult to
- # discover which version we're using, or to work around using an
- # older one.
- date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
- refnames = keywords["refnames"].strip()
- if refnames.startswith("$Format"):
- if verbose:
- print("keywords are unexpanded, not using")
- raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
- refs = {r.strip() for r in refnames.strip("()").split(",")}
- # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
- # just "foo-1.0". If we see a "tag: " prefix, prefer those.
- TAG = "tag: "
- tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
- if not tags:
- # Either we're using git < 1.8.3, or there really are no tags. We use
- # a heuristic: assume all version tags have a digit. The old git %%d
- # expansion behaves like git log --decorate=short and strips out the
- # refs/heads/ and refs/tags/ prefixes that would let us distinguish
- # between branches and tags. By ignoring refnames without digits, we
- # filter out many common branch names like "release" and
- # "stabilization", as well as "HEAD" and "master".
- tags = {r for r in refs if re.search(r'\d', r)}
- if verbose:
- print("discarding '%%s', no digits" %% ",".join(refs - tags))
- if verbose:
- print("likely tags: %%s" %% ",".join(sorted(tags)))
- for ref in sorted(tags):
- # sorting will prefer e.g. "2.0" over "2.0rc1"
- if ref.startswith(tag_prefix):
- r = ref[len(tag_prefix):]
- # Filter out refs that exactly match prefix or that don't start
- # with a number once the prefix is stripped (mostly a concern
- # when prefix is '')
- if not re.match(r'\d', r):
- continue
- if verbose:
- print("picking %%s" %% r)
- return {"version": r,
- "full-revisionid": keywords["full"].strip(),
- "dirty": False, "error": None,
- "date": date}
- # no suitable tags, so version is "0+unknown", but full hex is still there
- if verbose:
- print("no suitable tags, using unknown + full revision id")
- return {"version": "0+unknown",
- "full-revisionid": keywords["full"].strip(),
- "dirty": False, "error": "no suitable tags", "date": None}
-
-
-@register_vcs_handler("git", "pieces_from_vcs")
-def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
- """Get version from 'git describe' in the root of the source tree.
-
- This only gets called if the git-archive 'subst' keywords were *not*
- expanded, and _version.py hasn't already been rewritten with a short
- version string, meaning we're inside a checked out source tree.
- """
- GITS = ["git"]
- TAG_PREFIX_REGEX = "*"
- if sys.platform == "win32":
- GITS = ["git.cmd", "git.exe"]
- TAG_PREFIX_REGEX = r"\*"
-
- _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
- hide_stderr=True)
- if rc != 0:
- if verbose:
- print("Directory %%s not under git control" %% root)
- raise NotThisMethod("'git rev-parse --git-dir' returned error")
-
- # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
- # if there isn't one, this yields HEX[-dirty] (no NUM)
- describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty",
- "--always", "--long",
- "--match",
- "%%s%%s" %% (tag_prefix, TAG_PREFIX_REGEX)],
- cwd=root)
- # --long was added in git-1.5.5
- if describe_out is None:
- raise NotThisMethod("'git describe' failed")
- describe_out = describe_out.strip()
- full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
- if full_out is None:
- raise NotThisMethod("'git rev-parse' failed")
- full_out = full_out.strip()
-
- pieces = {}
- pieces["long"] = full_out
- pieces["short"] = full_out[:7] # maybe improved later
- pieces["error"] = None
-
- branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
- cwd=root)
- # --abbrev-ref was added in git-1.6.3
- if rc != 0 or branch_name is None:
- raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
- branch_name = branch_name.strip()
-
- if branch_name == "HEAD":
- # If we aren't exactly on a branch, pick a branch which represents
- # the current commit. If all else fails, we are on a branchless
- # commit.
- branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
- # --contains was added in git-1.5.4
- if rc != 0 or branches is None:
- raise NotThisMethod("'git branch --contains' returned error")
- branches = branches.split("\n")
-
- # Remove the first line if we're running detached
- if "(" in branches[0]:
- branches.pop(0)
-
- # Strip off the leading "* " from the list of branches.
- branches = [branch[2:] for branch in branches]
- if "master" in branches:
- branch_name = "master"
- elif not branches:
- branch_name = None
- else:
- # Pick the first branch that is returned. Good or bad.
- branch_name = branches[0]
-
- pieces["branch"] = branch_name
-
- # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
- # TAG might have hyphens.
- git_describe = describe_out
-
- # look for -dirty suffix
- dirty = git_describe.endswith("-dirty")
- pieces["dirty"] = dirty
- if dirty:
- git_describe = git_describe[:git_describe.rindex("-dirty")]
-
- # now we have TAG-NUM-gHEX or HEX
-
- if "-" in git_describe:
- # TAG-NUM-gHEX
- mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
- if not mo:
- # unparsable. Maybe git-describe is misbehaving?
- pieces["error"] = ("unable to parse git-describe output: '%%s'"
- %% describe_out)
- return pieces
-
- # tag
- full_tag = mo.group(1)
- if not full_tag.startswith(tag_prefix):
- if verbose:
- fmt = "tag '%%s' doesn't start with prefix '%%s'"
- print(fmt %% (full_tag, tag_prefix))
- pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'"
- %% (full_tag, tag_prefix))
- return pieces
- pieces["closest-tag"] = full_tag[len(tag_prefix):]
-
- # distance: number of commits since tag
- pieces["distance"] = int(mo.group(2))
-
- # commit: short hex revision ID
- pieces["short"] = mo.group(3)
-
- else:
- # HEX: no tags
- pieces["closest-tag"] = None
- count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
- pieces["distance"] = int(count_out) # total number of commits
-
- # commit date: see ISO-8601 comment in git_versions_from_keywords()
- date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip()
- # Use only the last line. Previous lines may contain GPG signature
- # information.
- date = date.splitlines()[-1]
- pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
-
- return pieces
-
-
-def plus_or_dot(pieces):
- """Return a + if we don't already have one, else return a ."""
- if "+" in pieces.get("closest-tag", ""):
- return "."
- return "+"
-
-
-def render_pep440(pieces):
- """Build up version string, with post-release "local version identifier".
-
- Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
- get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
-
- Exceptions:
- 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- rendered += plus_or_dot(pieces)
- rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"])
- if pieces["dirty"]:
- rendered += ".dirty"
- else:
- # exception #1
- rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"],
- pieces["short"])
- if pieces["dirty"]:
- rendered += ".dirty"
- return rendered
-
-
-def render_pep440_branch(pieces):
- """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
-
- The ".dev0" means not master branch. Note that .dev0 sorts backwards
- (a feature branch will appear "older" than the master branch).
-
- Exceptions:
- 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- if pieces["branch"] != "master":
- rendered += ".dev0"
- rendered += plus_or_dot(pieces)
- rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"])
- if pieces["dirty"]:
- rendered += ".dirty"
- else:
- # exception #1
- rendered = "0"
- if pieces["branch"] != "master":
- rendered += ".dev0"
- rendered += "+untagged.%%d.g%%s" %% (pieces["distance"],
- pieces["short"])
- if pieces["dirty"]:
- rendered += ".dirty"
- return rendered
-
-
-def pep440_split_post(ver):
- """Split pep440 version string at the post-release segment.
-
- Returns the release segments before the post-release and the
- post-release version number (or -1 if no post-release segment is present).
- """
- vc = str.split(ver, ".post")
- return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
-
-
-def render_pep440_pre(pieces):
- """TAG[.postN.devDISTANCE] -- No -dirty.
-
- Exceptions:
- 1: no tags. 0.post0.devDISTANCE
- """
- if pieces["closest-tag"]:
- if pieces["distance"]:
- # update the post release segment
- tag_version, post_version = pep440_split_post(pieces["closest-tag"])
- rendered = tag_version
- if post_version is not None:
- rendered += ".post%%d.dev%%d" %% (post_version+1, pieces["distance"])
- else:
- rendered += ".post0.dev%%d" %% (pieces["distance"])
- else:
- # no commits, use the tag as the version
- rendered = pieces["closest-tag"]
- else:
- # exception #1
- rendered = "0.post0.dev%%d" %% pieces["distance"]
- return rendered
-
-
-def render_pep440_post(pieces):
- """TAG[.postDISTANCE[.dev0]+gHEX] .
-
- The ".dev0" means dirty. Note that .dev0 sorts backwards
- (a dirty tree will appear "older" than the corresponding clean one),
- but you shouldn't be releasing software with -dirty anyways.
-
- Exceptions:
- 1: no tags. 0.postDISTANCE[.dev0]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- rendered += ".post%%d" %% pieces["distance"]
- if pieces["dirty"]:
- rendered += ".dev0"
- rendered += plus_or_dot(pieces)
- rendered += "g%%s" %% pieces["short"]
- else:
- # exception #1
- rendered = "0.post%%d" %% pieces["distance"]
- if pieces["dirty"]:
- rendered += ".dev0"
- rendered += "+g%%s" %% pieces["short"]
- return rendered
-
-
-def render_pep440_post_branch(pieces):
- """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
-
- The ".dev0" means not master branch.
-
- Exceptions:
- 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- rendered += ".post%%d" %% pieces["distance"]
- if pieces["branch"] != "master":
- rendered += ".dev0"
- rendered += plus_or_dot(pieces)
- rendered += "g%%s" %% pieces["short"]
- if pieces["dirty"]:
- rendered += ".dirty"
- else:
- # exception #1
- rendered = "0.post%%d" %% pieces["distance"]
- if pieces["branch"] != "master":
- rendered += ".dev0"
- rendered += "+g%%s" %% pieces["short"]
- if pieces["dirty"]:
- rendered += ".dirty"
- return rendered
-
-
-def render_pep440_old(pieces):
- """TAG[.postDISTANCE[.dev0]] .
-
- The ".dev0" means dirty.
-
- Exceptions:
- 1: no tags. 0.postDISTANCE[.dev0]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- rendered += ".post%%d" %% pieces["distance"]
- if pieces["dirty"]:
- rendered += ".dev0"
- else:
- # exception #1
- rendered = "0.post%%d" %% pieces["distance"]
- if pieces["dirty"]:
- rendered += ".dev0"
- return rendered
-
-
-def render_git_describe(pieces):
- """TAG[-DISTANCE-gHEX][-dirty].
-
- Like 'git describe --tags --dirty --always'.
-
- Exceptions:
- 1: no tags. HEX[-dirty] (note: no 'g' prefix)
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"]:
- rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
- else:
- # exception #1
- rendered = pieces["short"]
- if pieces["dirty"]:
- rendered += "-dirty"
- return rendered
-
-
-def render_git_describe_long(pieces):
- """TAG-DISTANCE-gHEX[-dirty].
-
- Like 'git describe --tags --dirty --always -long'.
- The distance/hash is unconditional.
-
- Exceptions:
- 1: no tags. HEX[-dirty] (note: no 'g' prefix)
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
- else:
- # exception #1
- rendered = pieces["short"]
- if pieces["dirty"]:
- rendered += "-dirty"
- return rendered
-
-
-def render(pieces, style):
- """Render the given version pieces into the requested style."""
- if pieces["error"]:
- return {"version": "unknown",
- "full-revisionid": pieces.get("long"),
- "dirty": None,
- "error": pieces["error"],
- "date": None}
-
- if not style or style == "default":
- style = "pep440" # the default
-
- if style == "pep440":
- rendered = render_pep440(pieces)
- elif style == "pep440-branch":
- rendered = render_pep440_branch(pieces)
- elif style == "pep440-pre":
- rendered = render_pep440_pre(pieces)
- elif style == "pep440-post":
- rendered = render_pep440_post(pieces)
- elif style == "pep440-post-branch":
- rendered = render_pep440_post_branch(pieces)
- elif style == "pep440-old":
- rendered = render_pep440_old(pieces)
- elif style == "git-describe":
- rendered = render_git_describe(pieces)
- elif style == "git-describe-long":
- rendered = render_git_describe_long(pieces)
- else:
- raise ValueError("unknown style '%%s'" %% style)
-
- return {"version": rendered, "full-revisionid": pieces["long"],
- "dirty": pieces["dirty"], "error": None,
- "date": pieces.get("date")}
-
-
-def get_versions():
- """Get version information or return default if unable to do so."""
- # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
- # __file__, we can work backwards from there to the root. Some
- # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
- # case we can only use expanded keywords.
-
- cfg = get_config()
- verbose = cfg.verbose
-
- try:
- return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
- verbose)
- except NotThisMethod:
- pass
-
- try:
- root = os.path.realpath(__file__)
- # versionfile_source is the relative path from the top of the source
- # tree (where the .git directory might live) to this file. Invert
- # this to find the root from __file__.
- for _ in cfg.versionfile_source.split('/'):
- root = os.path.dirname(root)
- except NameError:
- return {"version": "0+unknown", "full-revisionid": None,
- "dirty": None,
- "error": "unable to find root of source tree",
- "date": None}
-
- try:
- pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
- return render(pieces, cfg.style)
- except NotThisMethod:
- pass
-
- try:
- if cfg.parentdir_prefix:
- return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
- except NotThisMethod:
- pass
-
- return {"version": "0+unknown", "full-revisionid": None,
- "dirty": None,
- "error": "unable to compute version", "date": None}
-'''
-
-
-@register_vcs_handler("git", "get_keywords")
-def git_get_keywords(versionfile_abs):
- """Extract version information from the given file."""
- # the code embedded in _version.py can just fetch the value of these
- # keywords. When used from setup.py, we don't want to import _version.py,
- # so we do it with a regexp instead. This function is not used from
- # _version.py.
- keywords = {}
- try:
- with open(versionfile_abs, "r") as fobj:
- for line in fobj:
- if line.strip().startswith("git_refnames ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["refnames"] = mo.group(1)
- if line.strip().startswith("git_full ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["full"] = mo.group(1)
- if line.strip().startswith("git_date ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["date"] = mo.group(1)
- except OSError:
- pass
- return keywords
-
-
-@register_vcs_handler("git", "keywords")
-def git_versions_from_keywords(keywords, tag_prefix, verbose):
- """Get version information from git keywords."""
- if "refnames" not in keywords:
- raise NotThisMethod("Short version file found")
- date = keywords.get("date")
- if date is not None:
- # Use only the last line. Previous lines may contain GPG signature
- # information.
- date = date.splitlines()[-1]
-
- # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
- # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
- # -like" string, which we must then edit to make compliant), because
- # it's been around since git-1.5.3, and it's too difficult to
- # discover which version we're using, or to work around using an
- # older one.
- date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
- refnames = keywords["refnames"].strip()
- if refnames.startswith("$Format"):
- if verbose:
- print("keywords are unexpanded, not using")
- raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
- refs = {r.strip() for r in refnames.strip("()").split(",")}
- # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
- # just "foo-1.0". If we see a "tag: " prefix, prefer those.
- TAG = "tag: "
- tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
- if not tags:
- # Either we're using git < 1.8.3, or there really are no tags. We use
- # a heuristic: assume all version tags have a digit. The old git %d
- # expansion behaves like git log --decorate=short and strips out the
- # refs/heads/ and refs/tags/ prefixes that would let us distinguish
- # between branches and tags. By ignoring refnames without digits, we
- # filter out many common branch names like "release" and
- # "stabilization", as well as "HEAD" and "master".
- tags = {r for r in refs if re.search(r'\d', r)}
- if verbose:
- print("discarding '%s', no digits" % ",".join(refs - tags))
- if verbose:
- print("likely tags: %s" % ",".join(sorted(tags)))
- for ref in sorted(tags):
- # sorting will prefer e.g. "2.0" over "2.0rc1"
- if ref.startswith(tag_prefix):
- r = ref[len(tag_prefix):]
- # Filter out refs that exactly match prefix or that don't start
- # with a number once the prefix is stripped (mostly a concern
- # when prefix is '')
- if not re.match(r'\d', r):
- continue
- if verbose:
- print("picking %s" % r)
- return {"version": r,
- "full-revisionid": keywords["full"].strip(),
- "dirty": False, "error": None,
- "date": date}
- # no suitable tags, so version is "0+unknown", but full hex is still there
- if verbose:
- print("no suitable tags, using unknown + full revision id")
- return {"version": "0+unknown",
- "full-revisionid": keywords["full"].strip(),
- "dirty": False, "error": "no suitable tags", "date": None}
-
-
-@register_vcs_handler("git", "pieces_from_vcs")
-def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
- """Get version from 'git describe' in the root of the source tree.
-
- This only gets called if the git-archive 'subst' keywords were *not*
- expanded, and _version.py hasn't already been rewritten with a short
- version string, meaning we're inside a checked out source tree.
- """
- GITS = ["git"]
- TAG_PREFIX_REGEX = "*"
- if sys.platform == "win32":
- GITS = ["git.cmd", "git.exe"]
- TAG_PREFIX_REGEX = r"\*"
-
- _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
- hide_stderr=True)
- if rc != 0:
- if verbose:
- print("Directory %s not under git control" % root)
- raise NotThisMethod("'git rev-parse --git-dir' returned error")
-
- # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
- # if there isn't one, this yields HEX[-dirty] (no NUM)
- describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty",
- "--always", "--long",
- "--match",
- "%s%s" % (tag_prefix, TAG_PREFIX_REGEX)],
- cwd=root)
- # --long was added in git-1.5.5
- if describe_out is None:
- raise NotThisMethod("'git describe' failed")
- describe_out = describe_out.strip()
- full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
- if full_out is None:
- raise NotThisMethod("'git rev-parse' failed")
- full_out = full_out.strip()
-
- pieces = {}
- pieces["long"] = full_out
- pieces["short"] = full_out[:7] # maybe improved later
- pieces["error"] = None
-
- branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
- cwd=root)
- # --abbrev-ref was added in git-1.6.3
- if rc != 0 or branch_name is None:
- raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
- branch_name = branch_name.strip()
-
- if branch_name == "HEAD":
- # If we aren't exactly on a branch, pick a branch which represents
- # the current commit. If all else fails, we are on a branchless
- # commit.
- branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
- # --contains was added in git-1.5.4
- if rc != 0 or branches is None:
- raise NotThisMethod("'git branch --contains' returned error")
- branches = branches.split("\n")
-
- # Remove the first line if we're running detached
- if "(" in branches[0]:
- branches.pop(0)
-
- # Strip off the leading "* " from the list of branches.
- branches = [branch[2:] for branch in branches]
- if "master" in branches:
- branch_name = "master"
- elif not branches:
- branch_name = None
- else:
- # Pick the first branch that is returned. Good or bad.
- branch_name = branches[0]
-
- pieces["branch"] = branch_name
-
- # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
- # TAG might have hyphens.
- git_describe = describe_out
-
- # look for -dirty suffix
- dirty = git_describe.endswith("-dirty")
- pieces["dirty"] = dirty
- if dirty:
- git_describe = git_describe[:git_describe.rindex("-dirty")]
-
- # now we have TAG-NUM-gHEX or HEX
-
- if "-" in git_describe:
- # TAG-NUM-gHEX
- mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
- if not mo:
- # unparsable. Maybe git-describe is misbehaving?
- pieces["error"] = ("unable to parse git-describe output: '%s'"
- % describe_out)
- return pieces
-
- # tag
- full_tag = mo.group(1)
- if not full_tag.startswith(tag_prefix):
- if verbose:
- fmt = "tag '%s' doesn't start with prefix '%s'"
- print(fmt % (full_tag, tag_prefix))
- pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
- % (full_tag, tag_prefix))
- return pieces
- pieces["closest-tag"] = full_tag[len(tag_prefix):]
-
- # distance: number of commits since tag
- pieces["distance"] = int(mo.group(2))
-
- # commit: short hex revision ID
- pieces["short"] = mo.group(3)
-
- else:
- # HEX: no tags
- pieces["closest-tag"] = None
- count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
- pieces["distance"] = int(count_out) # total number of commits
-
- # commit date: see ISO-8601 comment in git_versions_from_keywords()
- date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
- # Use only the last line. Previous lines may contain GPG signature
- # information.
- date = date.splitlines()[-1]
- pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
-
- return pieces
-
-
-def do_vcs_install(manifest_in, versionfile_source, ipy):
- """Git-specific installation logic for Versioneer.
-
- For Git, this means creating/changing .gitattributes to mark _version.py
- for export-subst keyword substitution.
- """
- GITS = ["git"]
- if sys.platform == "win32":
- GITS = ["git.cmd", "git.exe"]
- files = [manifest_in, versionfile_source]
- if ipy:
- files.append(ipy)
- try:
- my_path = __file__
- if my_path.endswith(".pyc") or my_path.endswith(".pyo"):
- my_path = os.path.splitext(my_path)[0] + ".py"
- versioneer_file = os.path.relpath(my_path)
- except NameError:
- versioneer_file = "versioneer.py"
- files.append(versioneer_file)
- present = False
- try:
- with open(".gitattributes", "r") as fobj:
- for line in fobj:
- if line.strip().startswith(versionfile_source):
- if "export-subst" in line.strip().split()[1:]:
- present = True
- break
- except OSError:
- pass
- if not present:
- with open(".gitattributes", "a+") as fobj:
- fobj.write(f"{versionfile_source} export-subst\n")
- files.append(".gitattributes")
- run_command(GITS, ["add", "--"] + files)
-
-
-def versions_from_parentdir(parentdir_prefix, root, verbose):
- """Try to determine the version from the parent directory name.
-
- Source tarballs conventionally unpack into a directory that includes both
- the project name and a version string. We will also support searching up
- two directory levels for an appropriately named parent directory
- """
- rootdirs = []
-
- for _ in range(3):
- dirname = os.path.basename(root)
- if dirname.startswith(parentdir_prefix):
- return {"version": dirname[len(parentdir_prefix):],
- "full-revisionid": None,
- "dirty": False, "error": None, "date": None}
- rootdirs.append(root)
- root = os.path.dirname(root) # up a level
-
- if verbose:
- print("Tried directories %s but none started with prefix %s" %
- (str(rootdirs), parentdir_prefix))
- raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
-
-
-SHORT_VERSION_PY = """
-# This file was generated by 'versioneer.py' (0.21) from
-# revision-control system data, or from the parent directory name of an
-# unpacked source archive. Distribution tarballs contain a pre-generated copy
-# of this file.
-
-import json
-
-version_json = '''
-%s
-''' # END VERSION_JSON
-
-
-def get_versions():
- return json.loads(version_json)
-"""
-
-
-def versions_from_file(filename):
- """Try to determine the version from _version.py if present."""
- try:
- with open(filename) as f:
- contents = f.read()
- except OSError:
- raise NotThisMethod("unable to read _version.py")
- mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON",
- contents, re.M | re.S)
- if not mo:
- mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON",
- contents, re.M | re.S)
- if not mo:
- raise NotThisMethod("no version_json in _version.py")
- return json.loads(mo.group(1))
-
-
-def write_to_version_file(filename, versions):
- """Write the given version number to the given _version.py file."""
- os.unlink(filename)
- contents = json.dumps(versions, sort_keys=True,
- indent=1, separators=(",", ": "))
- with open(filename, "w") as f:
- f.write(SHORT_VERSION_PY % contents)
-
- print("set %s to '%s'" % (filename, versions["version"]))
-
-
-def plus_or_dot(pieces):
- """Return a + if we don't already have one, else return a ."""
- if "+" in pieces.get("closest-tag", ""):
- return "."
- return "+"
-
-
-def render_pep440(pieces):
- """Build up version string, with post-release "local version identifier".
-
- Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
- get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
-
- Exceptions:
- 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- rendered += plus_or_dot(pieces)
- rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
- if pieces["dirty"]:
- rendered += ".dirty"
- else:
- # exception #1
- rendered = "0+untagged.%d.g%s" % (pieces["distance"],
- pieces["short"])
- if pieces["dirty"]:
- rendered += ".dirty"
- return rendered
-
-
-def render_pep440_branch(pieces):
- """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
-
- The ".dev0" means not master branch. Note that .dev0 sorts backwards
- (a feature branch will appear "older" than the master branch).
-
- Exceptions:
- 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- if pieces["branch"] != "master":
- rendered += ".dev0"
- rendered += plus_or_dot(pieces)
- rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
- if pieces["dirty"]:
- rendered += ".dirty"
- else:
- # exception #1
- rendered = "0"
- if pieces["branch"] != "master":
- rendered += ".dev0"
- rendered += "+untagged.%d.g%s" % (pieces["distance"],
- pieces["short"])
- if pieces["dirty"]:
- rendered += ".dirty"
- return rendered
-
-
-def pep440_split_post(ver):
- """Split pep440 version string at the post-release segment.
-
- Returns the release segments before the post-release and the
- post-release version number (or -1 if no post-release segment is present).
- """
- vc = str.split(ver, ".post")
- return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
-
-
-def render_pep440_pre(pieces):
- """TAG[.postN.devDISTANCE] -- No -dirty.
-
- Exceptions:
- 1: no tags. 0.post0.devDISTANCE
- """
- if pieces["closest-tag"]:
- if pieces["distance"]:
- # update the post release segment
- tag_version, post_version = pep440_split_post(pieces["closest-tag"])
- rendered = tag_version
- if post_version is not None:
- rendered += ".post%d.dev%d" % (post_version+1, pieces["distance"])
- else:
- rendered += ".post0.dev%d" % (pieces["distance"])
- else:
- # no commits, use the tag as the version
- rendered = pieces["closest-tag"]
- else:
- # exception #1
- rendered = "0.post0.dev%d" % pieces["distance"]
- return rendered
-
-
-def render_pep440_post(pieces):
- """TAG[.postDISTANCE[.dev0]+gHEX] .
-
- The ".dev0" means dirty. Note that .dev0 sorts backwards
- (a dirty tree will appear "older" than the corresponding clean one),
- but you shouldn't be releasing software with -dirty anyways.
-
- Exceptions:
- 1: no tags. 0.postDISTANCE[.dev0]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- rendered += ".post%d" % pieces["distance"]
- if pieces["dirty"]:
- rendered += ".dev0"
- rendered += plus_or_dot(pieces)
- rendered += "g%s" % pieces["short"]
- else:
- # exception #1
- rendered = "0.post%d" % pieces["distance"]
- if pieces["dirty"]:
- rendered += ".dev0"
- rendered += "+g%s" % pieces["short"]
- return rendered
-
-
-def render_pep440_post_branch(pieces):
- """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
-
- The ".dev0" means not master branch.
-
- Exceptions:
- 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- rendered += ".post%d" % pieces["distance"]
- if pieces["branch"] != "master":
- rendered += ".dev0"
- rendered += plus_or_dot(pieces)
- rendered += "g%s" % pieces["short"]
- if pieces["dirty"]:
- rendered += ".dirty"
- else:
- # exception #1
- rendered = "0.post%d" % pieces["distance"]
- if pieces["branch"] != "master":
- rendered += ".dev0"
- rendered += "+g%s" % pieces["short"]
- if pieces["dirty"]:
- rendered += ".dirty"
- return rendered
-
-
-def render_pep440_old(pieces):
- """TAG[.postDISTANCE[.dev0]] .
-
- The ".dev0" means dirty.
-
- Exceptions:
- 1: no tags. 0.postDISTANCE[.dev0]
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"] or pieces["dirty"]:
- rendered += ".post%d" % pieces["distance"]
- if pieces["dirty"]:
- rendered += ".dev0"
- else:
- # exception #1
- rendered = "0.post%d" % pieces["distance"]
- if pieces["dirty"]:
- rendered += ".dev0"
- return rendered
-
-
-def render_git_describe(pieces):
- """TAG[-DISTANCE-gHEX][-dirty].
-
- Like 'git describe --tags --dirty --always'.
-
- Exceptions:
- 1: no tags. HEX[-dirty] (note: no 'g' prefix)
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- if pieces["distance"]:
- rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
- else:
- # exception #1
- rendered = pieces["short"]
- if pieces["dirty"]:
- rendered += "-dirty"
- return rendered
-
-
-def render_git_describe_long(pieces):
- """TAG-DISTANCE-gHEX[-dirty].
-
- Like 'git describe --tags --dirty --always -long'.
- The distance/hash is unconditional.
-
- Exceptions:
- 1: no tags. HEX[-dirty] (note: no 'g' prefix)
- """
- if pieces["closest-tag"]:
- rendered = pieces["closest-tag"]
- rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
- else:
- # exception #1
- rendered = pieces["short"]
- if pieces["dirty"]:
- rendered += "-dirty"
- return rendered
-
-
-def render(pieces, style):
- """Render the given version pieces into the requested style."""
- if pieces["error"]:
- return {"version": "unknown",
- "full-revisionid": pieces.get("long"),
- "dirty": None,
- "error": pieces["error"],
- "date": None}
-
- if not style or style == "default":
- style = "pep440" # the default
-
- if style == "pep440":
- rendered = render_pep440(pieces)
- elif style == "pep440-branch":
- rendered = render_pep440_branch(pieces)
- elif style == "pep440-pre":
- rendered = render_pep440_pre(pieces)
- elif style == "pep440-post":
- rendered = render_pep440_post(pieces)
- elif style == "pep440-post-branch":
- rendered = render_pep440_post_branch(pieces)
- elif style == "pep440-old":
- rendered = render_pep440_old(pieces)
- elif style == "git-describe":
- rendered = render_git_describe(pieces)
- elif style == "git-describe-long":
- rendered = render_git_describe_long(pieces)
- else:
- raise ValueError("unknown style '%s'" % style)
-
- return {"version": rendered, "full-revisionid": pieces["long"],
- "dirty": pieces["dirty"], "error": None,
- "date": pieces.get("date")}
-
-
-class VersioneerBadRootError(Exception):
- """The project root directory is unknown or missing key files."""
-
-
-def get_versions(verbose=False):
- """Get the project version from whatever source is available.
-
- Returns dict with two keys: 'version' and 'full'.
- """
- if "versioneer" in sys.modules:
- # see the discussion in cmdclass.py:get_cmdclass()
- del sys.modules["versioneer"]
-
- root = get_root()
- cfg = get_config_from_root(root)
-
- assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg"
- handlers = HANDLERS.get(cfg.VCS)
- assert handlers, "unrecognized VCS '%s'" % cfg.VCS
- verbose = verbose or cfg.verbose
- assert cfg.versionfile_source is not None, \
- "please set versioneer.versionfile_source"
- assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
-
- versionfile_abs = os.path.join(root, cfg.versionfile_source)
-
- # extract version from first of: _version.py, VCS command (e.g. 'git
- # describe'), parentdir. This is meant to work for developers using a
- # source checkout, for users of a tarball created by 'setup.py sdist',
- # and for users of a tarball/zipball created by 'git archive' or github's
- # download-from-tag feature or the equivalent in other VCSes.
-
- get_keywords_f = handlers.get("get_keywords")
- from_keywords_f = handlers.get("keywords")
- if get_keywords_f and from_keywords_f:
- try:
- keywords = get_keywords_f(versionfile_abs)
- ver = from_keywords_f(keywords, cfg.tag_prefix, verbose)
- if verbose:
- print("got version from expanded keyword %s" % ver)
- return ver
- except NotThisMethod:
- pass
-
- try:
- ver = versions_from_file(versionfile_abs)
- if verbose:
- print("got version from file %s %s" % (versionfile_abs, ver))
- return ver
- except NotThisMethod:
- pass
-
- from_vcs_f = handlers.get("pieces_from_vcs")
- if from_vcs_f:
- try:
- pieces = from_vcs_f(cfg.tag_prefix, root, verbose)
- ver = render(pieces, cfg.style)
- if verbose:
- print("got version from VCS %s" % ver)
- return ver
- except NotThisMethod:
- pass
-
- try:
- if cfg.parentdir_prefix:
- ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
- if verbose:
- print("got version from parentdir %s" % ver)
- return ver
- except NotThisMethod:
- pass
-
- if verbose:
- print("unable to compute version")
-
- return {"version": "0+unknown", "full-revisionid": None,
- "dirty": None, "error": "unable to compute version",
- "date": None}
-
-
-def get_version():
- """Get the short version string for this project."""
- return get_versions()["version"]
-
-
-def get_cmdclass(cmdclass=None):
- """Get the custom setuptools/distutils subclasses used by Versioneer.
-
- If the package uses a different cmdclass (e.g. one from numpy), it
- should be provide as an argument.
- """
- if "versioneer" in sys.modules:
- del sys.modules["versioneer"]
- # this fixes the "python setup.py develop" case (also 'install' and
- # 'easy_install .'), in which subdependencies of the main project are
- # built (using setup.py bdist_egg) in the same python process. Assume
- # a main project A and a dependency B, which use different versions
- # of Versioneer. A's setup.py imports A's Versioneer, leaving it in
- # sys.modules by the time B's setup.py is executed, causing B to run
- # with the wrong versioneer. Setuptools wraps the sub-dep builds in a
- # sandbox that restores sys.modules to it's pre-build state, so the
- # parent is protected against the child's "import versioneer". By
- # removing ourselves from sys.modules here, before the child build
- # happens, we protect the child from the parent's versioneer too.
- # Also see https://github.com/python-versioneer/python-versioneer/issues/52
-
- cmds = {} if cmdclass is None else cmdclass.copy()
-
- # we add "version" to both distutils and setuptools
- from distutils.core import Command
-
- class cmd_version(Command):
- description = "report generated version string"
- user_options = []
- boolean_options = []
-
- def initialize_options(self):
- pass
-
- def finalize_options(self):
- pass
-
- def run(self):
- vers = get_versions(verbose=True)
- print("Version: %s" % vers["version"])
- print(" full-revisionid: %s" % vers.get("full-revisionid"))
- print(" dirty: %s" % vers.get("dirty"))
- print(" date: %s" % vers.get("date"))
- if vers["error"]:
- print(" error: %s" % vers["error"])
- cmds["version"] = cmd_version
-
- # we override "build_py" in both distutils and setuptools
- #
- # most invocation pathways end up running build_py:
- # distutils/build -> build_py
- # distutils/install -> distutils/build ->..
- # setuptools/bdist_wheel -> distutils/install ->..
- # setuptools/bdist_egg -> distutils/install_lib -> build_py
- # setuptools/install -> bdist_egg ->..
- # setuptools/develop -> ?
- # pip install:
- # copies source tree to a tempdir before running egg_info/etc
- # if .git isn't copied too, 'git describe' will fail
- # then does setup.py bdist_wheel, or sometimes setup.py install
- # setup.py egg_info -> ?
-
- # we override different "build_py" commands for both environments
- if 'build_py' in cmds:
- _build_py = cmds['build_py']
- elif "setuptools" in sys.modules:
- from setuptools.command.build_py import build_py as _build_py
- else:
- from distutils.command.build_py import build_py as _build_py
-
- class cmd_build_py(_build_py):
- def run(self):
- root = get_root()
- cfg = get_config_from_root(root)
- versions = get_versions()
- _build_py.run(self)
- # now locate _version.py in the new build/ directory and replace
- # it with an updated value
- if cfg.versionfile_build:
- target_versionfile = os.path.join(self.build_lib,
- cfg.versionfile_build)
- print("UPDATING %s" % target_versionfile)
- write_to_version_file(target_versionfile, versions)
- cmds["build_py"] = cmd_build_py
-
- if 'build_ext' in cmds:
- _build_ext = cmds['build_ext']
- elif "setuptools" in sys.modules:
- from setuptools.command.build_ext import build_ext as _build_ext
- else:
- from distutils.command.build_ext import build_ext as _build_ext
-
- class cmd_build_ext(_build_ext):
- def run(self):
- root = get_root()
- cfg = get_config_from_root(root)
- versions = get_versions()
- _build_ext.run(self)
- if self.inplace:
- # build_ext --inplace will only build extensions in
- # build/lib<..> dir with no _version.py to write to.
- # As in place builds will already have a _version.py
- # in the module dir, we do not need to write one.
- return
- # now locate _version.py in the new build/ directory and replace
- # it with an updated value
- target_versionfile = os.path.join(self.build_lib,
- cfg.versionfile_build)
- print("UPDATING %s" % target_versionfile)
- write_to_version_file(target_versionfile, versions)
- cmds["build_ext"] = cmd_build_ext
-
- if "cx_Freeze" in sys.modules: # cx_freeze enabled?
- from cx_Freeze.dist import build_exe as _build_exe
- # nczeczulin reports that py2exe won't like the pep440-style string
- # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
- # setup(console=[{
- # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION
- # "product_version": versioneer.get_version(),
- # ...
-
- class cmd_build_exe(_build_exe):
- def run(self):
- root = get_root()
- cfg = get_config_from_root(root)
- versions = get_versions()
- target_versionfile = cfg.versionfile_source
- print("UPDATING %s" % target_versionfile)
- write_to_version_file(target_versionfile, versions)
-
- _build_exe.run(self)
- os.unlink(target_versionfile)
- with open(cfg.versionfile_source, "w") as f:
- LONG = LONG_VERSION_PY[cfg.VCS]
- f.write(LONG %
- {"DOLLAR": "$",
- "STYLE": cfg.style,
- "TAG_PREFIX": cfg.tag_prefix,
- "PARENTDIR_PREFIX": cfg.parentdir_prefix,
- "VERSIONFILE_SOURCE": cfg.versionfile_source,
- })
- cmds["build_exe"] = cmd_build_exe
- del cmds["build_py"]
-
- if 'py2exe' in sys.modules: # py2exe enabled?
- from py2exe.distutils_buildexe import py2exe as _py2exe
-
- class cmd_py2exe(_py2exe):
- def run(self):
- root = get_root()
- cfg = get_config_from_root(root)
- versions = get_versions()
- target_versionfile = cfg.versionfile_source
- print("UPDATING %s" % target_versionfile)
- write_to_version_file(target_versionfile, versions)
-
- _py2exe.run(self)
- os.unlink(target_versionfile)
- with open(cfg.versionfile_source, "w") as f:
- LONG = LONG_VERSION_PY[cfg.VCS]
- f.write(LONG %
- {"DOLLAR": "$",
- "STYLE": cfg.style,
- "TAG_PREFIX": cfg.tag_prefix,
- "PARENTDIR_PREFIX": cfg.parentdir_prefix,
- "VERSIONFILE_SOURCE": cfg.versionfile_source,
- })
- cmds["py2exe"] = cmd_py2exe
-
- # we override different "sdist" commands for both environments
- if 'sdist' in cmds:
- _sdist = cmds['sdist']
- elif "setuptools" in sys.modules:
- from setuptools.command.sdist import sdist as _sdist
- else:
- from distutils.command.sdist import sdist as _sdist
-
- class cmd_sdist(_sdist):
- def run(self):
- versions = get_versions()
- self._versioneer_generated_versions = versions
- # unless we update this, the command will keep using the old
- # version
- self.distribution.metadata.version = versions["version"]
- return _sdist.run(self)
-
- def make_release_tree(self, base_dir, files):
- root = get_root()
- cfg = get_config_from_root(root)
- _sdist.make_release_tree(self, base_dir, files)
- # now locate _version.py in the new base_dir directory
- # (remembering that it may be a hardlink) and replace it with an
- # updated value
- target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
- print("UPDATING %s" % target_versionfile)
- write_to_version_file(target_versionfile,
- self._versioneer_generated_versions)
- cmds["sdist"] = cmd_sdist
-
- return cmds
-
-
-CONFIG_ERROR = """
-setup.cfg is missing the necessary Versioneer configuration. You need
-a section like:
-
- [versioneer]
- VCS = git
- style = pep440
- versionfile_source = src/myproject/_version.py
- versionfile_build = myproject/_version.py
- tag_prefix =
- parentdir_prefix = myproject-
-
-You will also need to edit your setup.py to use the results:
-
- import versioneer
- setup(version=versioneer.get_version(),
- cmdclass=versioneer.get_cmdclass(), ...)
-
-Please read the docstring in ./versioneer.py for configuration instructions,
-edit setup.cfg, and re-run the installer or 'python versioneer.py setup'.
-"""
-
-SAMPLE_CONFIG = """
-# See the docstring in versioneer.py for instructions. Note that you must
-# re-run 'versioneer.py setup' after changing this section, and commit the
-# resulting files.
-
-[versioneer]
-#VCS = git
-#style = pep440
-#versionfile_source =
-#versionfile_build =
-#tag_prefix =
-#parentdir_prefix =
-
-"""
-
-OLD_SNIPPET = """
-from ._version import get_versions
-__version__ = get_versions()['version']
-del get_versions
-"""
-
-INIT_PY_SNIPPET = """
-from . import {0}
-__version__ = {0}.get_versions()['version']
-"""
-
-
-def do_setup():
- """Do main VCS-independent setup function for installing Versioneer."""
- root = get_root()
- try:
- cfg = get_config_from_root(root)
- except (OSError, configparser.NoSectionError,
- configparser.NoOptionError) as e:
- if isinstance(e, (OSError, configparser.NoSectionError)):
- print("Adding sample versioneer config to setup.cfg",
- file=sys.stderr)
- with open(os.path.join(root, "setup.cfg"), "a") as f:
- f.write(SAMPLE_CONFIG)
- print(CONFIG_ERROR, file=sys.stderr)
- return 1
-
- print(" creating %s" % cfg.versionfile_source)
- with open(cfg.versionfile_source, "w") as f:
- LONG = LONG_VERSION_PY[cfg.VCS]
- f.write(LONG % {"DOLLAR": "$",
- "STYLE": cfg.style,
- "TAG_PREFIX": cfg.tag_prefix,
- "PARENTDIR_PREFIX": cfg.parentdir_prefix,
- "VERSIONFILE_SOURCE": cfg.versionfile_source,
- })
-
- ipy = os.path.join(os.path.dirname(cfg.versionfile_source),
- "__init__.py")
- if os.path.exists(ipy):
- try:
- with open(ipy, "r") as f:
- old = f.read()
- except OSError:
- old = ""
- module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0]
- snippet = INIT_PY_SNIPPET.format(module)
- if OLD_SNIPPET in old:
- print(" replacing boilerplate in %s" % ipy)
- with open(ipy, "w") as f:
- f.write(old.replace(OLD_SNIPPET, snippet))
- elif snippet not in old:
- print(" appending to %s" % ipy)
- with open(ipy, "a") as f:
- f.write(snippet)
- else:
- print(" %s unmodified" % ipy)
- else:
- print(" %s doesn't exist, ok" % ipy)
- ipy = None
-
- # Make sure both the top-level "versioneer.py" and versionfile_source
- # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so
- # they'll be copied into source distributions. Pip won't be able to
- # install the package without this.
- manifest_in = os.path.join(root, "MANIFEST.in")
- simple_includes = set()
- try:
- with open(manifest_in, "r") as f:
- for line in f:
- if line.startswith("include "):
- for include in line.split()[1:]:
- simple_includes.add(include)
- except OSError:
- pass
- # That doesn't cover everything MANIFEST.in can do
- # (http://docs.python.org/2/distutils/sourcedist.html#commands), so
- # it might give some false negatives. Appending redundant 'include'
- # lines is safe, though.
- if "versioneer.py" not in simple_includes:
- print(" appending 'versioneer.py' to MANIFEST.in")
- with open(manifest_in, "a") as f:
- f.write("include versioneer.py\n")
- else:
- print(" 'versioneer.py' already in MANIFEST.in")
- if cfg.versionfile_source not in simple_includes:
- print(" appending versionfile_source ('%s') to MANIFEST.in" %
- cfg.versionfile_source)
- with open(manifest_in, "a") as f:
- f.write("include %s\n" % cfg.versionfile_source)
- else:
- print(" versionfile_source already in MANIFEST.in")
-
- # Make VCS-specific changes. For git, this means creating/changing
- # .gitattributes to mark _version.py for export-subst keyword
- # substitution.
- do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
- return 0
-
-
-def scan_setup_py():
- """Validate the contents of setup.py against Versioneer's expectations."""
- found = set()
- setters = False
- errors = 0
- with open("setup.py", "r") as f:
- for line in f.readlines():
- if "import versioneer" in line:
- found.add("import")
- if "versioneer.get_cmdclass()" in line:
- found.add("cmdclass")
- if "versioneer.get_version()" in line:
- found.add("get_version")
- if "versioneer.VCS" in line:
- setters = True
- if "versioneer.versionfile_source" in line:
- setters = True
- if len(found) != 3:
- print("")
- print("Your setup.py appears to be missing some important items")
- print("(but I might be wrong). Please make sure it has something")
- print("roughly like the following:")
- print("")
- print(" import versioneer")
- print(" setup( version=versioneer.get_version(),")
- print(" cmdclass=versioneer.get_cmdclass(), ...)")
- print("")
- errors += 1
- if setters:
- print("You should remove lines like 'versioneer.VCS = ' and")
- print("'versioneer.versionfile_source = ' . This configuration")
- print("now lives in setup.cfg, and should be removed from setup.py")
- print("")
- errors += 1
- return errors
-
-
-if __name__ == "__main__":
- cmd = sys.argv[1]
- if cmd == "setup":
- errors = do_setup()
- errors += scan_setup_py()
- if errors:
- sys.exit(1)